From ba837fc7e4403bf8653bd164446609022f011366 Mon Sep 17 00:00:00 2001
From: root <root@dc-smc-25.amd.com>
Date: Mon, 20 Apr 2020 02:24:48 +0000
Subject: [PATCH 01/18] add igemm bwd v4r1 xdlops kernel

---
 src/CMakeLists.txt                            |   1 +
 src/include/miopen/solver.hpp                 |  87 +++
 ...plicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp | 434 +++++++++++
 ...plicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp | 143 ++++
 src/mlo_dir_conv.cpp                          |   3 +-
 src/ocl/convolutionocl.cpp                    |   4 +-
 src/solver.cpp                                |   3 +
 ...hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 688 ++++++++++++++++++
 8 files changed, 1361 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 src/CMakeLists.txt
 mode change 100644 => 100755 src/include/miopen/solver.hpp
 create mode 100755 src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp
 create mode 100755 src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp
 mode change 100644 => 100755 src/mlo_dir_conv.cpp
 mode change 100644 => 100755 src/ocl/convolutionocl.cpp
 mode change 100644 => 100755 src/solver.cpp
 create mode 100755 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
old mode 100644
new mode 100755
index c39ac496d6..853b403ae8
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -216,6 +216,7 @@ set( MIOpen_Source
     solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp
     solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp
     solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp
+    solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
     )
 
 list(APPEND MIOpen_Source tmp_dir.cpp binary_cache.cpp md5.cpp)
diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp
old mode 100644
new mode 100755
index bdddb8919b..fef33514ba
--- a/src/include/miopen/solver.hpp
+++ b/src/include/miopen/solver.hpp
@@ -668,6 +668,70 @@ struct PerformanceImplicitGemmBwdDataV4R1 : Serializable<PerformanceImplicitGemm
     std::string ToString() const;
 };
 
+struct PerformanceImplicitGemmBwdDataV4R1Xdlops
+    : Serializable<PerformanceImplicitGemmBwdDataV4R1Xdlops>
+{
+    int GemmNPerBlock; // 2^n[8..16]
+    int GemmMPerBlock; // 2^n[32..128]
+    int GemmKPerBlock; // 2^n[4..16]
+
+    int GemmMPerWave;
+    int GemmNPerWave;
+
+    int InBlockCopyClusterLengths_GemmK; // 2^n[4..16]
+    int InBlockCopyClusterLengths_GemmN; // 2^n[8..64]
+
+    int WeiBlockCopyClusterLengths_GemmK; // 2^n[1..4]
+    int WeiBlockCopyClusterLengths_GemmM; // 2^n[16..128]
+
+    bool use_spare_set;
+
+    PerformanceImplicitGemmBwdDataV4R1Xdlops(int, int, int, int, int, int, int, int, int, bool);
+
+    PerformanceImplicitGemmBwdDataV4R1Xdlops()
+        : PerformanceImplicitGemmBwdDataV4R1Xdlops(-1, -1, -1, -1, -1, -1, -1, -1, -1, false)
+    {
+    }
+
+    PerformanceImplicitGemmBwdDataV4R1Xdlops(
+        int a, int b, int c, int d, int e, int f, int g, int h, int i)
+        : PerformanceImplicitGemmBwdDataV4R1Xdlops(a, b, c, d, e, f, g, h, i, false)
+    {
+    }
+
+    PerformanceImplicitGemmBwdDataV4R1Xdlops(bool spare);
+
+    bool operator==(const PerformanceImplicitGemmBwdDataV4R1Xdlops& other) const;
+
+    template <class Self, class F>
+    static void Visit(Self&& self, F f)
+    {
+        f(self.GemmNPerBlock, "GemmNPerBlock");
+        f(self.GemmMPerBlock, "GemmMPerBlock");
+        f(self.GemmKPerBlock, "GemmKPerBlock");
+        f(self.GemmMPerWave, "GemmMPerWave");
+        f(self.GemmNPerWave, "GemmNPerWave");
+        f(self.InBlockCopyClusterLengths_GemmK, "InBlockCopyClusterLengths_GemmK");
+        f(self.InBlockCopyClusterLengths_GemmN, "InBlockCopyClusterLengths_GemmN");
+        f(self.WeiBlockCopyClusterLengths_GemmK, "WeiBlockCopyClusterLengths_GemmK");
+        f(self.WeiBlockCopyClusterLengths_GemmM, "WeiBlockCopyClusterLengths_GemmM");
+    }
+
+    std::tuple<int, bool> CalculateGridSize(const ConvolutionContext& ctx) const;
+    std::tuple<int, int, bool>
+    CalculateBlockGemmPerformanceParameters(const ConvolutionContext& ctx) const;
+    std::tuple<int, int, bool>
+    CalculateGemmABlockCopyPerformanceParameters(const ConvolutionContext& ctx) const;
+    std::tuple<int, int, bool>
+    CalculateGemmBBlockCopyPerformanceParameters(const ConvolutionContext& ctx) const;
+    // std::tuple<std::size_t, bool> CalculateLdsNumberOfByte(const ConvolutionContext& ctx) const;
+    bool IsValidValue() const;
+    bool IsValid(const ConvolutionContext& ctx) const;
+    void EuristicInit(const ConvolutionContext& ctx);
+    bool SetNextValue();
+    std::string ToString() const;
+};
+
 struct ConvHipImplicitGemmV4R1Fwd : SolverBase<ConvolutionContext>
 {
     PerformanceImplicitGemmV4R1 GetPerformanceConfig(const ConvolutionContext& ctx) const;
@@ -997,6 +1061,29 @@ struct ConvHipImplicitGemmBwdDataV4R1 : SolverBase<ConvolutionContext>
                              bool disableConfigOverrideFromEnv = false) const;
 };
 
+struct ConvHipImplicitGemmBwdDataV4R1Xdlops : SolverBase<ConvolutionContext>
+{
+    static int CalculateNumberOfGemm(const ConvolutionContext& ctx);
+    static std::tuple<int, int, int> CalculateGemmSize(const ConvolutionContext& ctx, int gemm_id);
+    PerformanceImplicitGemmBwdDataV4R1Xdlops
+    GetPerformanceConfig(const ConvolutionContext& ctx) const;
+    bool IsValidPerformanceConfig(const ConvolutionContext& ctx,
+                                  const PerformanceImplicitGemmBwdDataV4R1Xdlops& c) const;
+    bool IsApplicable(const ConvolutionContext& ctx) const;
+    ConvSolution GetSolution(const ConvolutionContext& ctx,
+                             const PerformanceImplicitGemmBwdDataV4R1Xdlops& config,
+                             bool disableConfigOverrideFromEnv = false) const;
+    PerformanceImplicitGemmBwdDataV4R1Xdlops Search(const ConvolutionContext&) const;
+    int RunAndMeasureSolution(miopen::Handle& profile_h,
+                              ConstData_t bot_buf,
+                              Data_t top_buf,
+                              ConstData_t wei_buf,
+                              ConstData_t bias_buf,
+                              const ConvolutionContext& ctx,
+                              const ConvSolution& solution,
+                              float& elapsed_time) const;
+};
+
 struct ConvHipImplicitGemmBwdDataV1R1Xdlops : SolverBase<ConvolutionContext>
 {
     PerformanceImplicitGemmXdlops GetPerformanceConfig(const ConvolutionContext& ctx) const;
diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp
new file mode 100755
index 0000000000..789a24c68f
--- /dev/null
+++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp
@@ -0,0 +1,434 @@
+#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V4R1_XDLOPS_NCHW_KCYX_NKHW_HPP
+#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V4R1_XDLOPS_NCHW_KCYX_NKHW_HPP
+
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_xdlops.hpp"
+
+namespace ck {
+
+// Number of GEMMs: YTilda * XTilda
+// GemmM = C
+// GemmN = N * HTildaSlice * WTildaSlice
+// GemmK = K * YDotSlice * XDotSlice
+template <index_t GridSize,
+          index_t BlockSize,
+          typename Float,
+          typename AccFloat,
+          typename InGlobalDesc,
+          typename WeiGlobalDesc,
+          typename OutGlobalDesc,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t GemmMPerBlock,
+          index_t GemmNPerBlock,
+          index_t GemmKPerBlock,
+          index_t GemmMPerWave,
+          index_t GemmNPerWave,
+          index_t GemmThreadGemmDataPerReadM,
+          index_t GemmThreadGemmDataPerReadN,
+          typename GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
+          typename GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
+          index_t GemmABlockCopySrcDataPerRead_GemmM,
+          index_t GemmABlockCopyDstDataPerWrite_GemmM,
+          typename GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
+          typename GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
+          index_t GemmBBlockCopySrcDataPerRead_GemmN,
+          index_t GemmBBlockCopyDstDataPerWrite_GemmN>
+struct GridwiseConvolutionBackwardDataImplicitGemm_v4r1_xdlops_nchw_kcyx_nkhw
+{
+    __host__ __device__ static constexpr index_t GetNumberOfGemm()
+    {
+        constexpr index_t ConvStrideH = ConvStrides{}[0];
+        constexpr index_t ConvStrideW = ConvStrides{}[1];
+
+        constexpr index_t ConvDilationH = ConvDilations{}[0];
+        constexpr index_t ConvDilationW = ConvDilations{}[1];
+
+        constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+        constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
+
+        constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH;
+        constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW;
+
+        return YTilda * XTilda;
+    }
+
+    __host__ __device__ static constexpr auto GetGemmSizeImpl(index_t iYTilda, index_t iXTilda)
+    {
+        constexpr index_t N  = InGlobalDesc::GetLengths()[0];
+        constexpr index_t C  = InGlobalDesc::GetLengths()[1];
+        constexpr index_t Hi = InGlobalDesc::GetLengths()[2];
+        constexpr index_t Wi = InGlobalDesc::GetLengths()[3];
+
+        constexpr index_t K  = OutGlobalDesc::GetLengths()[1];
+        constexpr index_t Ho = OutGlobalDesc::GetLengths()[2];
+        constexpr index_t Wo = OutGlobalDesc::GetLengths()[3];
+
+        constexpr index_t Y = WeiGlobalDesc::GetLengths()[2];
+        constexpr index_t X = WeiGlobalDesc::GetLengths()[3];
+
+        constexpr index_t ConvStrideH = ConvStrides{}[0];
+        constexpr index_t ConvStrideW = ConvStrides{}[1];
+
+        constexpr index_t ConvDilationH = ConvDilations{}[0];
+        constexpr index_t ConvDilationW = ConvDilations{}[1];
+
+        constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+        constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
+
+        constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH;
+        constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW;
+
+        constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda);
+        constexpr index_t XDot = math::integer_divide_ceil(X, XTilda);
+
+        constexpr index_t HTilda =
+            Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
+        constexpr index_t WTilda =
+            Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW);
+
+        // only work on HTilda and WTilda that contribute to non-padding area of input tensor
+        constexpr index_t iHTildaLeft = math::integer_divide_floor(
+            math::max(0, InLeftPads{}[0] - ConvDilationH * (YTilda - 1)), ConvStrides{}[0]);
+        constexpr index_t iWTildaLeft = math::integer_divide_floor(
+            math::max(0, InLeftPads{}[1] - ConvDilationW * (XTilda - 1)), ConvStrides{}[1]);
+
+        constexpr index_t iHTildaRight = math::min(
+            HTilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
+        constexpr index_t iWTildaRight = math::min(
+            WTilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);
+
+        constexpr index_t HTildaSlice = iHTildaRight - iHTildaLeft;
+        constexpr index_t WTildaSlice = iWTildaRight - iWTildaLeft;
+
+        // GemmM and GemmN
+        constexpr index_t GemmM = C;
+        constexpr index_t GemmN = N * HTildaSlice * WTildaSlice;
+
+        // GemmK is different for each GEMM
+        index_t YDotSlice = (iYTilda + 1) * YDot <= Y ? YDot : Y % YDot;
+        index_t XDotSlice = (iXTilda + 1) * XDot <= X ? XDot : X % XDot;
+
+        index_t GemmK = K * YDotSlice * XDotSlice;
+
+        return Array<index_t, 3>{GemmM, GemmN, GemmK};
+    }
+
+    __host__ __device__ static constexpr auto GetGemmSize(index_t gemm_id)
+    {
+        constexpr index_t ConvStrideW = ConvStrides{}[1];
+
+        constexpr index_t ConvDilationW = ConvDilations{}[1];
+
+        constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
+
+        constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW;
+
+        index_t iYTilda = gemm_id / XTilda;
+        index_t iXTilda = gemm_id % XTilda;
+
+        return GetGemmSizeImpl(iYTilda, iXTilda);
+    }
+
+    template <index_t iYTilda, index_t iXTilda>
+    __device__ static void RunImpl(Float* __restrict__ p_in_global,
+                                   const Float* __restrict__ p_wei_global,
+                                   const Float* __restrict__ p_out_global)
+    {
+        constexpr auto in_n_c_hi_wi_global_desc  = InGlobalDesc{};
+        constexpr auto wei_k_c_y_x_global_desc   = WeiGlobalDesc{};
+        constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{};
+
+        constexpr index_t N  = in_n_c_hi_wi_global_desc.GetLengths()[0];
+        constexpr index_t C  = in_n_c_hi_wi_global_desc.GetLengths()[1];
+        constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2];
+        constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3];
+
+        constexpr index_t K  = out_n_k_ho_wo_global_desc.GetLengths()[1];
+        constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2];
+        constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLengths()[3];
+
+        constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2];
+        constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3];
+
+        constexpr index_t ConvStrideH = ConvStrides{}[0];
+        constexpr index_t ConvStrideW = ConvStrides{}[1];
+
+        constexpr index_t ConvDilationH = ConvDilations{}[0];
+        constexpr index_t ConvDilationW = ConvDilations{}[1];
+
+        //\todo static_assert for global vector load/store
+        // statc_assert();
+
+        constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+        constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
+
+        constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH;
+        constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW;
+
+        constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda);
+        constexpr index_t XDot = math::integer_divide_ceil(X, XTilda);
+
+        constexpr index_t HTilda =
+            Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
+        constexpr index_t WTilda =
+            Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW);
+
+        // only work on HTilda and WTilda that contribute to non-padding area of input tensor
+        constexpr index_t iHTildaLeft = math::integer_divide_floor(
+            math::max(0, InLeftPads{}[0] - ConvDilationH * (YTilda - 1)), ConvStrides{}[0]);
+        constexpr index_t iWTildaLeft = math::integer_divide_floor(
+            math::max(0, InLeftPads{}[1] - ConvDilationW * (XTilda - 1)), ConvStrides{}[1]);
+
+        constexpr index_t iHTildaRight = math::min(
+            HTilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
+        constexpr index_t iWTildaRight = math::min(
+            WTilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);
+
+        constexpr index_t HTildaSlice = iHTildaRight - iHTildaLeft;
+        constexpr index_t WTildaSlice = iWTildaRight - iWTildaLeft;
+
+        // weight out-of-bound check can be skipped
+        constexpr bool wei_skip_out_of_bound_check = true;
+
+        // weight tensor
+        constexpr auto wei_k_c_ydot_ytilda_xdot_xtilda_global_desc = transform_tensor_descriptor(
+            wei_k_c_y_x_global_desc,
+            make_tuple(PassThrough<K>{},
+                       PassThrough<C>{},
+                       Embed<Y,
+                             Sequence<YDot, YTilda>,
+                             Sequence<ConvStrideH / GcdStrideDilationH, 1, 0>,
+                             wei_skip_out_of_bound_check>{},
+                       Embed<X,
+                             Sequence<XDot, XTilda>,
+                             Sequence<ConvStrideW / GcdStrideDilationW, 1, 0>,
+                             wei_skip_out_of_bound_check>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+
+#if !CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
+        constexpr bool out_skip_out_of_bound_check = false;
+#else
+        //\todo sometimes output tensor out-of-bound check can be skipped, find out all such
+        // situations
+        constexpr bool out_skip_out_of_bound_check = true;
+#endif
+
+        // output tensor
+        constexpr auto out_n_k_ydot_htilda_xdot_wtilda_global_desc = transform_tensor_descriptor(
+            out_n_k_ho_wo_global_desc,
+            make_tuple(PassThrough<N>{},
+                       PassThrough<K>{},
+                       Embed<Ho,
+                             Sequence<YDot, HTilda>,
+                             Sequence<-ConvDilationH / GcdStrideDilationH, 1, 0>,
+                             out_skip_out_of_bound_check>{},
+                       Embed<Wo,
+                             Sequence<XDot, WTilda>,
+                             Sequence<-ConvDilationW / GcdStrideDilationW, 1, 0>,
+                             out_skip_out_of_bound_check>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+
+        constexpr auto out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc =
+            transform_tensor_descriptor(
+                out_n_k_ydot_htilda_xdot_wtilda_global_desc,
+                make_tuple(PassThrough<N>{},
+                           PassThrough<K>{},
+                           PassThrough<YTilda>{},
+                           PassThrough<XTilda>{},
+                           Slice<Sequence<HTilda, WTilda>,
+                                 Sequence<iHTildaLeft, iWTildaLeft>,
+                                 Sequence<iHTildaRight, iWTildaRight>>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}));
+
+#if !CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
+        constexpr bool in_skip_out_of_bound_check = false;
+#else
+        //\todo sometimes input out-of-bound check can be skipped, find out all such situations
+        constexpr bool in_skip_out_of_bound_check = true;
+#endif
+
+        // input tensor
+        constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
+            in_n_c_hi_wi_global_desc,
+            make_tuple(
+                PassThrough<N>{},
+                PassThrough<C>{},
+                Pad<Sequence<Hi, Wi>, InLeftPads, InRightPads, in_skip_out_of_bound_check>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));
+
+        constexpr index_t Hip = in_n_c_hip_wip_global_desc.GetLengths()[2];
+        constexpr index_t Wip = in_n_c_hip_wip_global_desc.GetLengths()[3];
+
+        constexpr auto in_n_c_ytilda_htilda_xtilda_wtilda_global_desc = transform_tensor_descriptor(
+            in_n_c_hip_wip_global_desc,
+            make_tuple(PassThrough<N>{},
+                       PassThrough<C>{},
+                       Embed<Hip,
+                             Sequence<YTilda, HTilda>,
+                             Sequence<ConvDilationH, ConvStrideH, 0>,
+                             in_skip_out_of_bound_check>{},
+                       Embed<Wip,
+                             Sequence<XTilda, WTilda>,
+                             Sequence<ConvDilationW, ConvStrideW, 0>,
+                             in_skip_out_of_bound_check>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+
+        constexpr auto in_n_c_ytilda_htildaslice_xtilda_wtildaslice_global_desc =
+            transform_tensor_descriptor(
+                in_n_c_ytilda_htilda_xtilda_wtilda_global_desc,
+                make_tuple(PassThrough<N>{},
+                           PassThrough<C>{},
+                           PassThrough<YTilda>{},
+                           PassThrough<XTilda>{},
+                           Slice<Sequence<HTilda, WTilda>,
+                                 Sequence<iHTildaLeft, iWTildaLeft>,
+                                 Sequence<iHTildaRight, iWTildaRight>>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}));
+
+        // GEMM
+        constexpr index_t YDotSlice = (iYTilda + 1) * YDot <= Y ? YDot : Y % YDot;
+        constexpr index_t XDotSlice = (iXTilda + 1) * XDot <= X ? XDot : X % XDot;
+
+        // A matrix
+        constexpr auto wei_k_c_ydotslice_ytidaslice_xdotslice_xtildaslice_global_desc =
+            transform_tensor_descriptor(
+                wei_k_c_ydot_ytilda_xdot_xtilda_global_desc,
+                make_tuple(
+                    PassThrough<K>{},
+                    PassThrough<C>{},
+                    Slice<Sequence<YDot, XDot>, Sequence<0, 0>, Sequence<YDotSlice, XDotSlice>>{},
+                    Slice<Sequence<YTilda, XTilda>,
+                          Sequence<iYTilda, iXTilda>,
+                          Sequence<iYTilda + 1, iXTilda + 1>>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 4>{}, Sequence<3, 5>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 4>{}, Sequence<3, 5>{}));
+
+        constexpr auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor(
+            wei_k_c_ydotslice_ytidaslice_xdotslice_xtildaslice_global_desc,
+            make_tuple(Merge<Sequence<K, YDotSlice, XDotSlice>>{}, Merge<Sequence<C, 1, 1>>{}),
+            make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        // B matrix
+        constexpr auto out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc =
+            transform_tensor_descriptor(
+                out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc,
+                make_tuple(
+                    PassThrough<N>{},
+                    PassThrough<K>{},
+                    PassThrough<HTildaSlice>{},
+                    PassThrough<WTildaSlice>{},
+                    Slice<Sequence<YDot, XDot>, Sequence<0, 0>, Sequence<YDotSlice, XDotSlice>>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}));
+
+        constexpr auto out_gemmk_gemmn_global_desc = transform_tensor_descriptor(
+            out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc,
+            make_tuple(Merge<Sequence<K, YDotSlice, XDotSlice>>{},
+                       Merge<Sequence<N, HTildaSlice, WTildaSlice>>{}),
+            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        // C matrix
+        constexpr auto in_n_c_ytildaslice_htildaslice_xtildaslice_wtildaslice_global_desc =
+            transform_tensor_descriptor(
+                in_n_c_ytilda_htildaslice_xtilda_wtildaslice_global_desc,
+                make_tuple(PassThrough<N>{},
+                           PassThrough<C>{},
+                           PassThrough<HTildaSlice>{},
+                           PassThrough<WTildaSlice>{},
+                           Slice<Sequence<YTilda, XTilda>,
+                                 Sequence<iYTilda, iXTilda>,
+                                 Sequence<iYTilda + 1, iXTilda + 1>>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}));
+
+        constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor(
+            in_n_c_ytildaslice_htildaslice_xtildaslice_wtildaslice_global_desc,
+            make_tuple(Merge<Sequence<C, 1, 1>>{}, Merge<Sequence<N, HTildaSlice, WTildaSlice>>{}),
+            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        constexpr auto gridwise_gemm = GridwiseGemmTransposedANormalBNormalCXdlops_v1<
+            GridSize,
+            BlockSize,
+            Float,
+            AccFloat,
+            decltype(wei_gemmk_gemmm_global_desc),
+            decltype(out_gemmk_gemmn_global_desc),
+            decltype(in_gemmm_gemmn_global_desc),
+            GemmMPerBlock,
+            GemmNPerBlock,
+            GemmKPerBlock,
+            GemmMPerWave,
+            GemmNPerWave,
+            GemmThreadGemmDataPerReadM,
+            GemmThreadGemmDataPerReadN,
+            GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
+            GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
+            Sequence<1, 0>,
+            Sequence<1, 0>,
+            Sequence<0, 1>,
+            1,
+            GemmABlockCopySrcDataPerRead_GemmM,
+            GemmABlockCopyDstDataPerWrite_GemmM,
+            GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
+            GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
+            Sequence<0, 1>,
+            Sequence<0, 1>,
+            Sequence<0, 1>,
+            1,
+            GemmBBlockCopySrcDataPerRead_GemmN,
+            GemmBBlockCopyDstDataPerWrite_GemmN,
+            InMemoryDataOperation::Set>{};
+
+        gridwise_gemm.Run(p_wei_global, p_out_global, p_in_global);
+    }
+
+    template <index_t GemmId>
+    __device__ static void Run(Float* __restrict__ p_in_global,
+                               const Float* __restrict__ p_wei_global,
+                               const Float* __restrict__ p_out_global)
+    {
+        constexpr index_t ConvStrideH = ConvStrides{}[0];
+        constexpr index_t ConvStrideW = ConvStrides{}[1];
+
+        constexpr index_t ConvDilationH = ConvDilations{}[0];
+        constexpr index_t ConvDilationW = ConvDilations{}[1];
+
+        constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+        constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
+
+        constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH;
+        constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW;
+
+        constexpr index_t iYTilda = GemmId / XTilda;
+        constexpr index_t iXTilda = GemmId % XTilda;
+
+        static_assert(iYTilda < YTilda && iXTilda < XTilda, "wrong! iYtilda, iXtilda");
+
+        RunImpl<iYTilda, iXTilda>(p_in_global, p_wei_global, p_out_global);
+    }
+};
+
+} // namespace ck
+#endif
diff --git a/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp
new file mode 100755
index 0000000000..61a54a0c46
--- /dev/null
+++ b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp
@@ -0,0 +1,143 @@
+#include "common_header.hpp"
+#include "gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp"
+#include "float_types.h"
+
+extern "C" __global__
+    __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw(
+        const FLOAT* const __restrict__ p_out_global,
+        const FLOAT* const __restrict__ p_wei_global,
+        FLOAT* const __restrict__ p_in_global)
+{
+    using namespace ck;
+
+    // read problem parameters
+    constexpr index_t N  = CK_PARAM_PROBLEM_N;
+    constexpr index_t K  = CK_PARAM_PROBLEM_K;
+    constexpr index_t C  = CK_PARAM_PROBLEM_C;
+    constexpr index_t Hi = CK_PARAM_PROBLEM_HI;
+    constexpr index_t Wi = CK_PARAM_PROBLEM_WI;
+    constexpr index_t Ho = CK_PARAM_PROBLEM_HO;
+    constexpr index_t Wo = CK_PARAM_PROBLEM_WO;
+    constexpr index_t Y  = CK_PARAM_PROBLEM_Y;
+    constexpr index_t X  = CK_PARAM_PROBLEM_X;
+
+    constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H;
+    constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W;
+
+    constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H;
+    constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W;
+
+    constexpr index_t InLeftPadH = CK_PARAM_PROBLEM_IN_LEFT_PAD_H;
+    constexpr index_t InLeftPadW = CK_PARAM_PROBLEM_IN_LEFT_PAD_W;
+
+    constexpr index_t InRightPadH = CK_PARAM_PROBLEM_IN_RIGHT_PAD_H;
+    constexpr index_t InRightPadW = CK_PARAM_PROBLEM_IN_RIGHT_PAD_W;
+
+    constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE;
+    constexpr index_t GridSize  = CK_PARAM_DEPENDENT_GRID_SIZE;
+
+    constexpr index_t GemmMPerBlock = CK_PARAM_TUNABLE_GEMM_M_PER_BLOCK;
+    constexpr index_t GemmNPerBlock = CK_PARAM_TUNABLE_GEMM_N_PER_BLOCK;
+    constexpr index_t GemmKPerBlock = CK_PARAM_TUNABLE_GEMM_K_PER_BLOCK;
+
+    constexpr auto in_nchw_desc  = make_native_tensor_descriptor_packed(Sequence<N, C, Hi, Wi>{});
+    constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
+    constexpr auto out_nkhw_desc = make_native_tensor_descriptor_packed(Sequence<N, K, Ho, Wo>{});
+
+    using ConvStrides   = Sequence<ConvStrideH, ConvStrideW>;
+    using ConvDilations = Sequence<ConvDilationH, ConvDilationW>;
+
+    using InLeftPads  = Sequence<InLeftPadH, InLeftPadW>;
+    using InRightPads = Sequence<InRightPadH, InRightPadW>;
+
+    // A matrix
+    constexpr index_t GemmABlockCopyClusterLengths_GemmK =
+        CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K;
+
+    constexpr index_t GemmABlockCopyClusterLengths_GemmM =
+        CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M;
+
+    constexpr index_t GemmABlockCopyThreadSliceLengths_GemmK =
+        GemmKPerBlock / GemmABlockCopyClusterLengths_GemmK;
+
+    constexpr index_t GemmABlockCopyThreadSliceLengths_GemmM =
+        GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM;
+
+    using GemmABlockCopyThreadSliceLengths_GemmK_GemmM =
+        Sequence<GemmABlockCopyThreadSliceLengths_GemmK, GemmABlockCopyThreadSliceLengths_GemmM>;
+
+    using GemmABlockCopyThreadClusterLengths_GemmK_GemmM =
+        Sequence<GemmABlockCopyClusterLengths_GemmK, GemmABlockCopyClusterLengths_GemmM>;
+
+    constexpr index_t GemmABlockCopySrcDataPerRead_GemmM =
+        CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_M;
+
+    constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM =
+        CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_M;
+
+    // B matrix
+    constexpr index_t GemmBBlockCopyClusterLengths_GemmK =
+        CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K;
+
+    constexpr index_t GemmBBlockCopyClusterLengths_GemmN =
+        CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N;
+
+    constexpr index_t GemmBBlockCopyThreadSliceLengths_GemmK =
+        GemmKPerBlock / GemmBBlockCopyClusterLengths_GemmK;
+
+    constexpr index_t GemmBBlockCopyThreadSliceLengths_GemmN =
+        GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN;
+
+    using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN =
+        Sequence<GemmBBlockCopyThreadSliceLengths_GemmK, GemmBBlockCopyThreadSliceLengths_GemmN>;
+
+    using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN =
+        Sequence<GemmBBlockCopyClusterLengths_GemmK, GemmBBlockCopyClusterLengths_GemmN>;
+
+    constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN =
+        CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N;
+
+    constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN =
+        CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N;
+
+    // C matrix
+    constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE;
+    constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE;
+
+    constexpr index_t GemmThreadGemmDataPerReadM = 1;
+    constexpr index_t GemmThreadGemmDataPerReadN = 1;
+
+    constexpr auto gridwise_conv_bwd_data =
+        GridwiseConvolutionBackwardDataImplicitGemm_v4r1_xdlops_nchw_kcyx_nkhw<
+            GridSize,
+            BlockSize,
+            FLOAT,
+            FLOAT_ACCUM,
+            decltype(in_nchw_desc),
+            decltype(wei_kcyx_desc),
+            decltype(out_nkhw_desc),
+            ConvStrides,
+            ConvDilations,
+            InLeftPads,
+            InRightPads,
+            GemmMPerBlock,
+            GemmNPerBlock,
+            GemmKPerBlock,
+            GemmMPerWave,
+            GemmNPerWave,
+            GemmThreadGemmDataPerReadM,
+            GemmThreadGemmDataPerReadN,
+            GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
+            GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
+            GemmABlockCopySrcDataPerRead_GemmM,
+            GemmABlockCopyDstDataPerWrite_GemmM,
+            GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
+            GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
+            GemmBBlockCopySrcDataPerRead_GemmN,
+            GemmBBlockCopyDstDataPerWrite_GemmN>{};
+
+    // these decide which GEMM will be called
+    constexpr index_t GemmId = CK_PARAM_GEMM_ID;
+
+    gridwise_conv_bwd_data.template Run<GemmId>(p_in_global, p_wei_global, p_out_global);
+}
diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp
old mode 100644
new mode 100755
index 02769b7723..f128e5a83b
--- a/src/mlo_dir_conv.cpp
+++ b/src/mlo_dir_conv.cpp
@@ -139,7 +139,8 @@ static auto GetImplicitGemmSolvers()
                                            miopen::solver::ConvHipImplicitGemmV4R1Fwd,
                                            miopen::solver::ConvHipImplicitGemmV4R4Fwd,
                                            miopen::solver::ConvHipImplicitGemmBwdDataV1R1,
-                                           miopen::solver::ConvHipImplicitGemmBwdDataV4R1>{};
+                                           miopen::solver::ConvHipImplicitGemmBwdDataV4R1,
+                                           miopen::solver::ConvHipImplicitGemmBwdDataV4R1Xdlops>{};
 }
 
 static auto GetWindogradSolvers()
diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp
old mode 100644
new mode 100755
index 94f3f663e8..e2f6324af2
--- a/src/ocl/convolutionocl.cpp
+++ b/src/ocl/convolutionocl.cpp
@@ -3104,7 +3104,9 @@ void ConvBwdImplicitGemm(const ConvolutionContext& /*ctx*/,
             elapsed += handle.GetKernelTime();
     }
     else if(kernel.GetName() ==
-            "gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw")
+                "gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw" ||
+            kernel.GetName() ==
+                "gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw")
     {
         // \todo this kernel doesn't always need to set-zero
         float zero = 0.f;
diff --git a/src/solver.cpp b/src/solver.cpp
old mode 100644
new mode 100755
index cbe6066db0..84be961d1a
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -302,6 +302,9 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
 
     RegisterWithSolver(
         registry, ++id, ConvHipImplicitGemmBwdDataV1R1Xdlops{}, miopenConvolutionAlgoImplicitGEMM);
+
+    RegisterWithSolver(
+        registry, ++id, ConvHipImplicitGemmBwdDataV4R1Xdlops{}, miopenConvolutionAlgoImplicitGEMM);
 }
 
 } // namespace solver
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
new file mode 100755
index 0000000000..2f8858f71a
--- /dev/null
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -0,0 +1,688 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include <cstddef>
+#include "miopen/solver.hpp"
+#include "miopen/handle.hpp"
+#include <miopen/generic_search.hpp>
+#include "implicitgemm_util.hpp"
+
+namespace miopen {
+namespace solver {
+
+std::tuple<int, bool>
+PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGridSize(const ConvolutionContext& ctx) const
+{
+    int GridSize = 0;
+
+    try
+    {
+        int gemm_m = 0;
+        int gemm_n = 0;
+
+        std::tie(gemm_m, gemm_n, std::ignore) =
+            ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateGemmSize(ctx, 0);
+
+        if(!(gemm_m % GemmMPerBlock == 0 && gemm_n % GemmNPerBlock == 0))
+            MIOPEN_THROW("invalid performance parameter");
+
+        GridSize = (gemm_m / GemmMPerBlock) * (gemm_n / GemmNPerBlock);
+    }
+    catch(...)
+    {
+        return std::make_tuple(-1, false);
+    }
+
+    return std::make_tuple(GridSize, true);
+}
+
+std::tuple<int, int, bool>
+PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmABlockCopyPerformanceParameters(
+    const ConvolutionContext& ctx) const
+{
+    int SrcDataPerRead_GemmM  = amd_buffer_load_max_length<float>();
+    int DstDataPerWrite_GemmM = amd_lds_write_max_length<float>();
+
+    try
+    {
+        // calculate vector length on gemmk dimension
+        SrcDataPerRead_GemmM = gcd(SrcDataPerRead_GemmM, GemmMPerBlock);
+
+        const auto y = ConvolutionContextInterpreter::GetFilterHeightY(ctx);
+        const auto x = ConvolutionContextInterpreter::GetFilterWidthX(ctx);
+
+        // \todo too conservative
+        if(!(y == 1 && x == 1))
+            SrcDataPerRead_GemmM = 1;
+
+        // calculate threadwise copy size
+        const auto a_data_per_thread_copy = GemmMPerBlock / WeiBlockCopyClusterLengths_GemmM;
+
+        if(!(a_data_per_thread_copy > 0))
+            MIOPEN_THROW("invalid performance parameter");
+
+        // GemmABlockCopySrcDataPerRead_GemmK also bounded by size of threadwise copy
+        SrcDataPerRead_GemmM = gcd(SrcDataPerRead_GemmM, a_data_per_thread_copy);
+
+        // GemmABlockCopyDstDataPerWrite_GemmM also bounded by size of threadwise copy
+        DstDataPerWrite_GemmM = gcd(DstDataPerWrite_GemmM, SrcDataPerRead_GemmM);
+    }
+    catch(...)
+    {
+        return std::make_tuple(-1, -1, false);
+    }
+
+    return std::make_tuple(SrcDataPerRead_GemmM, DstDataPerWrite_GemmM, true);
+}
+
+std::tuple<int, int, bool>
+PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformanceParameters(
+    const ConvolutionContext& ctx) const
+{
+    int SrcDataPerRead_GemmN  = amd_buffer_load_max_length<float>();
+    int DstDataPerWrite_GemmN = amd_lds_write_max_length<float>();
+
+    try
+    {
+        SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, GemmNPerBlock);
+
+        // calculate vector length on gemmn dimension
+        const auto y = ConvolutionContextInterpreter::GetFilterHeightY(ctx);
+        const auto x = ConvolutionContextInterpreter::GetFilterWidthX(ctx);
+
+        // \todo too conversative
+        if(y == 1 && x == 1)
+        {
+            const auto ho = ConvolutionContextInterpreter::GetOutputHeightHo(ctx);
+            const auto wo = ConvolutionContextInterpreter::GetOutputWidthWo(ctx);
+
+            SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, ho * wo);
+        }
+        else
+        {
+            SrcDataPerRead_GemmN = 1;
+        }
+
+        // calculate threadwise copy size
+        int b_data_per_thread_copy = GemmNPerBlock / InBlockCopyClusterLengths_GemmN;
+
+        if(!(b_data_per_thread_copy > 0))
+            MIOPEN_THROW("invalid performance parameter");
+
+        // GemmBBlockCopySrcDataPerRead_GemmN also bounded by size of threadwise copy
+        SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, b_data_per_thread_copy);
+
+        // GemmBBlockCopyDstDataPerWrite_GemmN also bounded by size of threadwise copy
+        DstDataPerWrite_GemmN = gcd(DstDataPerWrite_GemmN, SrcDataPerRead_GemmN);
+    }
+    catch(...)
+    {
+        MIOPEN_LOG_I("catch");
+        return std::make_tuple(-1, -1, false);
+    }
+
+    return std::make_tuple(SrcDataPerRead_GemmN, DstDataPerWrite_GemmN, true);
+}
+
+bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext& ctx) const
+{
+    int GemmM = 0, GemmN = 0, GemmK = 0;
+
+    const auto& GemmKBlocks = 1;
+
+    // check blockwise GEMM size
+    for(int gemm_id = 0; gemm_id < ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateNumberOfGemm(ctx);
+        ++gemm_id)
+    {
+
+        std::tie(GemmM, GemmN, GemmK) =
+            ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateGemmSize(ctx, gemm_id);
+
+        if(!(GemmM % GemmMPerBlock == 0 && GemmN % GemmNPerBlock == 0 &&
+             GemmK % (GemmKPerBlock * GemmKBlocks) == 0))
+            return false; // wrong! cannot divice N evenly among thread
+    }
+
+    const auto& GemmBBlockCopyClusterLengths_GemmK = InBlockCopyClusterLengths_GemmK;
+    const auto& GemmBBlockCopyClusterLengths_GemmN = InBlockCopyClusterLengths_GemmN;
+    const auto& GemmABlockCopyClusterLengths_GemmK = WeiBlockCopyClusterLengths_GemmK;
+    const auto& GemmABlockCopyClusterLengths_GemmM = WeiBlockCopyClusterLengths_GemmM;
+
+    if(!(GemmKPerBlock % GemmBBlockCopyClusterLengths_GemmK == 0 &&
+         GemmKPerBlock % GemmABlockCopyClusterLengths_GemmK == 0 &&
+         GemmNPerBlock % GemmBBlockCopyClusterLengths_GemmN == 0 &&
+         GemmMPerBlock % GemmABlockCopyClusterLengths_GemmM == 0))
+        return false;
+
+    // unsupported xdlops-gemm
+    if(GemmMPerWave == 16 && GemmNPerWave == 32)
+        return false;
+    if(GemmMPerWave == 32 && GemmNPerWave == 16)
+        return false;
+    if(GemmMPerWave == 8 && GemmNPerWave != 64)
+        return false;
+    if(GemmMPerWave == 4 && GemmNPerWave != 64)
+        return false;
+
+    const auto WaveSize  = 64;
+    const auto BlockSize = GemmNPerBlock * GemmMPerBlock / (GemmMPerWave * GemmNPerWave) * WaveSize;
+
+    // fail with blockSize >= 512
+    /// \todo fix the issue with blockSize >= 512
+    if(BlockSize < 64 || BlockSize > 256)
+        return false;
+
+    if(BlockSize != GemmBBlockCopyClusterLengths_GemmK * GemmBBlockCopyClusterLengths_GemmN)
+        return false;
+
+    if(BlockSize != GemmABlockCopyClusterLengths_GemmM * GemmABlockCopyClusterLengths_GemmK)
+        return false;
+
+    if((GemmMPerBlock % GemmMPerWave) != 0 || (GemmNPerBlock % GemmNPerWave) != 0)
+        return false;
+
+    const auto GemmBBlockCopyThreadSliceLengths_GemmN =
+        GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN;
+    const auto GemmABlockCopyThreadSliceLengths_GemmM =
+        GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM;
+    const auto lds_size = ComputeLDSRequiredSize(ctx,
+                                                 GemmNPerBlock,
+                                                 GemmMPerBlock,
+                                                 GemmKPerBlock,
+                                                 1,
+                                                 1,
+                                                 GemmBBlockCopyThreadSliceLengths_GemmN,
+                                                 GemmABlockCopyThreadSliceLengths_GemmM,
+                                                 true);
+    return lds_size <= 64 * 1024;
+}
+
+PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlops(bool spare)
+{
+    GemmNPerBlock = spare ? 16 : 64;
+    GemmMPerBlock = spare ? 4 : 64;
+    GemmKPerBlock = spare ? 4 : 8;
+
+    GemmMPerWave = spare ? 4 : 64;
+    GemmNPerWave = spare ? 16 : 64;
+
+    InBlockCopyClusterLengths_GemmK = 4;
+    InBlockCopyClusterLengths_GemmN = 4;
+
+    WeiBlockCopyClusterLengths_GemmK = 2;
+    WeiBlockCopyClusterLengths_GemmM = 4;
+
+    use_spare_set = spare;
+}
+
+PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlops(
+    int BPerBlock_,
+    int KPerBlock_,
+    int EPerBlock_,
+    int GemmMPerWave_,
+    int GemmNPerWave_,
+    int InBlockCopyClusterLengths_E_,
+    int InBlockCopyClusterLengths_B_,
+    int WeiBlockCopyClusterLengths_E_,
+    int WeiBlockCopyClusterLengths_K_,
+    bool use_spare_set_)
+    : GemmNPerBlock(BPerBlock_),
+      GemmMPerBlock(KPerBlock_),
+      GemmKPerBlock(EPerBlock_),
+      GemmMPerWave(GemmMPerWave_),
+      GemmNPerWave(GemmNPerWave_),
+      InBlockCopyClusterLengths_GemmK(InBlockCopyClusterLengths_E_),
+      InBlockCopyClusterLengths_GemmN(InBlockCopyClusterLengths_B_),
+      WeiBlockCopyClusterLengths_GemmK(WeiBlockCopyClusterLengths_E_),
+      WeiBlockCopyClusterLengths_GemmM(WeiBlockCopyClusterLengths_K_),
+      use_spare_set(use_spare_set_)
+{
+}
+
+bool PerformanceImplicitGemmBwdDataV4R1Xdlops::
+operator==(const PerformanceImplicitGemmBwdDataV4R1Xdlops& other) const
+{
+    // clang-format off
+    return GemmNPerBlock == other.GemmNPerBlock
+        && GemmMPerBlock == other.GemmMPerBlock
+        && GemmKPerBlock == other.GemmKPerBlock
+        && GemmMPerWave == other.GemmMPerWave
+        && GemmNPerWave == other.GemmNPerWave
+        && InBlockCopyClusterLengths_GemmK == other.InBlockCopyClusterLengths_GemmK
+        && InBlockCopyClusterLengths_GemmN == other.InBlockCopyClusterLengths_GemmN
+        && WeiBlockCopyClusterLengths_GemmK == other.WeiBlockCopyClusterLengths_GemmK
+        && WeiBlockCopyClusterLengths_GemmM == other.WeiBlockCopyClusterLengths_GemmM
+        && use_spare_set == other.use_spare_set;
+    // clang-format on
+}
+
+bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValidValue() const
+{
+    // clang-format off
+    return IsTwoPower<16,128>(GemmNPerBlock)
+        && IsTwoPower<4,128>(GemmMPerBlock)
+        && IsTwoPower<4,32>(GemmKPerBlock)
+        && IsTwoPower<4,64>(GemmMPerWave)
+        && IsTwoPower<16,64>(GemmNPerWave)
+        && IsTwoPower<4,16>(InBlockCopyClusterLengths_GemmK)
+        && IsTwoPower<4,64>(InBlockCopyClusterLengths_GemmN)
+        && IsTwoPower<2,16>(WeiBlockCopyClusterLengths_GemmK)
+        && IsTwoPower<4,128>(WeiBlockCopyClusterLengths_GemmM); // clang-format on
+}
+
+bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue()
+{
+    do
+    {
+        if(!use_spare_set)
+        {
+            if(!NextTwoPower<64, 128>(GemmNPerBlock))
+                break;
+            if(!NextTwoPower<64, 128>(GemmMPerBlock))
+                break;
+            if(!NextTwoPower<8, 32>(GemmKPerBlock))
+                break;
+        }
+        else
+        {
+            if(!NextTwoPower<16, 128>(GemmNPerBlock))
+                break;
+            if(!NextTwoPower<4, 128>(GemmMPerBlock))
+                break;
+            if(!NextTwoPower<4, 32>(GemmKPerBlock))
+                break;
+            if(!NextTwoPower<4, 64>(GemmMPerWave))
+                break;
+            if(!NextTwoPower<16, 64>(GemmNPerWave))
+                break;
+        }
+        if(!NextTwoPower<4, 16>(InBlockCopyClusterLengths_GemmK))
+            break;
+        if(!NextTwoPower<4, 64>(InBlockCopyClusterLengths_GemmN))
+            break;
+        if(!NextTwoPower<2, 16>(WeiBlockCopyClusterLengths_GemmK))
+            break;
+        if(!NextTwoPower<4, 128>(WeiBlockCopyClusterLengths_GemmM))
+            break;
+        return false;
+    } while(false);
+
+    return true;
+}
+
+void PerformanceImplicitGemmBwdDataV4R1Xdlops::EuristicInit(const ConvolutionContext& ctx)
+{
+    PerformanceImplicitGemmBwdDataV4R1Xdlops tmp;
+    tmp = {128, 128, 8, 64, 64, 4, 64, 4, 64, use_spare_set};
+    if(!tmp.IsValid(ctx))
+        tmp = {64, 32, 4, 32, 64, 4, 16, 2, 32, use_spare_set};
+    if(!tmp.IsValid(ctx))
+        tmp = {64, 32, 4, 32, 64, 4, 16, 4, 16, use_spare_set};
+    if(!tmp.IsValid(ctx))
+        tmp = {32, 64, 4, 64, 32, 4, 16, 4, 16, use_spare_set};
+    if(!tmp.IsValid(ctx))
+        tmp = {32, 32, 4, 32, 32, 4, 16, 2, 32, use_spare_set};
+    if(!tmp.IsValid(ctx))
+        tmp = {64, 16, 4, 16, 64, 4, 16, 4, 16, use_spare_set};
+    if(!tmp.IsValid(ctx))
+        tmp = {16, 64, 4, 64, 16, 4, 16, 4, 16, use_spare_set};
+    if(!tmp.IsValid(ctx))
+        tmp = {16, 16, 4, 16, 16, 4, 16, 4, 16, use_spare_set};
+    if(!tmp.IsValid(ctx))
+        tmp = {64, 4, 16, 4, 64, 16, 4, 16, 4, use_spare_set};
+    if(!tmp.IsValid(ctx))
+        tmp = {64, 8, 8, 8, 64, 4, 16, 8, 8, use_spare_set};
+    if(!tmp.IsValid(ctx))
+    {
+        MIOPEN_LOG_E("All attempts failed");
+        assert(false);
+    }
+    *this = tmp;
+    MIOPEN_LOG_I(ToString());
+}
+
+std::string PerformanceImplicitGemmBwdDataV4R1Xdlops::ToString() const
+{
+    std::ostringstream ss;
+    Serialize(ss);
+    return ss.str();
+}
+
+int ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateNumberOfGemm(const ConvolutionContext& ctx)
+{
+    const auto conv_stride_h = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideH(ctx);
+    const auto conv_stride_w = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideW(ctx);
+    const auto conv_dilation_h =
+        ConvolutionContextInterpreter::GetAdjustedConvolutionDilationH(ctx);
+    const auto conv_dilation_w =
+        ConvolutionContextInterpreter::GetAdjustedConvolutionDilationW(ctx);
+
+    const auto gcd_stride_dilation_h = gcd(conv_stride_h, conv_dilation_h);
+    const auto gcd_stride_dilation_w = gcd(conv_stride_w, conv_dilation_w);
+
+    const auto ytilda = conv_stride_h / gcd_stride_dilation_h;
+    const auto xtilda = conv_stride_w / gcd_stride_dilation_w;
+
+    return ytilda * xtilda;
+}
+
+std::tuple<int, int, int>
+ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateGemmSize(const ConvolutionContext& ctx, int gemm_id)
+{
+    const auto n             = ConvolutionContextInterpreter::GetBatchN(ctx);
+    const auto k             = ConvolutionContextInterpreter::GetOutputChannelK(ctx);
+    const auto c             = ConvolutionContextInterpreter::GetInputChannelC(ctx);
+    const auto hi            = ConvolutionContextInterpreter::GetInputHeightHi(ctx);
+    const auto wi            = ConvolutionContextInterpreter::GetInputWidthWi(ctx);
+    const auto ho            = ConvolutionContextInterpreter::GetOutputHeightHo(ctx);
+    const auto wo            = ConvolutionContextInterpreter::GetOutputWidthWo(ctx);
+    const auto y             = ConvolutionContextInterpreter::GetFilterHeightY(ctx);
+    const auto x             = ConvolutionContextInterpreter::GetFilterWidthX(ctx);
+    const auto conv_stride_h = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideH(ctx);
+    const auto conv_stride_w = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideW(ctx);
+    const auto conv_dilation_h =
+        ConvolutionContextInterpreter::GetAdjustedConvolutionDilationH(ctx);
+    const auto conv_dilation_w =
+        ConvolutionContextInterpreter::GetAdjustedConvolutionDilationW(ctx);
+    const auto in_left_pad_h = ConvolutionContextInterpreter::GetInputLeftPadH(ctx);
+    const auto in_left_pad_w = ConvolutionContextInterpreter::GetInputLeftPadW(ctx);
+
+    const auto gcd_stride_dilation_h = gcd(conv_stride_h, conv_dilation_h);
+    const auto gcd_stride_dilation_w = gcd(conv_stride_w, conv_dilation_w);
+
+    const auto ytilda = conv_stride_h / gcd_stride_dilation_h;
+    const auto xtilda = conv_stride_w / gcd_stride_dilation_w;
+
+    const auto ydot = integer_divide_ceil(y, ytilda);
+    const auto xdot = integer_divide_ceil(x, xtilda);
+
+    const auto htilda = ho + integer_divide_ceil(conv_dilation_h * (y - 1), conv_stride_h);
+    const auto wtilda = wo + integer_divide_ceil(conv_dilation_w * (x - 1), conv_stride_w);
+
+    // intermediate result could be negative, use int instead of size_t
+    const auto htilda_left =
+        std::max(0, in_left_pad_h - conv_dilation_h * (ytilda - 1)) / conv_stride_h;
+    const auto wtilda_left =
+        std::max(0, in_left_pad_w - conv_dilation_w * (xtilda - 1)) / conv_stride_w;
+
+    const auto htilda_right =
+        std::min(htilda, integer_divide_ceil(in_left_pad_h + hi - 1, conv_stride_h) + 1);
+    const auto wtilda_right =
+        std::min(wtilda, integer_divide_ceil(in_left_pad_w + wi - 1, conv_stride_w) + 1);
+
+    const auto htilda_slice = htilda_right - htilda_left;
+    const auto wtilda_slice = wtilda_right - wtilda_left;
+
+    // gemm_k size is different for each GEMM
+    const auto i_ytilda = gemm_id / xtilda;
+    const auto i_xtilda = gemm_id % xtilda;
+
+    const auto ydot_slice = (i_ytilda + 1) * ydot <= y ? ydot : y % ydot;
+    const auto xdot_slice = (i_xtilda + 1) * xdot <= x ? xdot : x % xdot;
+
+    const auto gemm_m = c;
+    const auto gemm_n = n * htilda_slice * wtilda_slice;
+    const auto gemm_k = k * ydot_slice * xdot_slice;
+
+    return std::make_tuple(gemm_m, gemm_n, gemm_k);
+}
+
+// TODO: add fp16 and bfp16 by ConvHipImplicitGemmBwdDataV4R1Xdlops::GetWorkspaceSize(const
+// ConvolutionContext& ctx) const
+
+bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsApplicable(const ConvolutionContext& ctx) const
+{
+#if WORKAROUND_SWDEV_229277_227616_229195
+    if(!IsHccCompiler())
+        return false;
+#endif
+    bool is_applicable = true;
+
+    if(!ctx.direction.IsBackwardData())
+        return false;
+
+    if(!ctx.Is2d())
+        return false;
+
+    if(!ctx.IsFp32())
+        return false;
+
+    if(ctx.group_counts != 1)
+        return false;
+
+    if(!IsApplicableXdlops(ctx))
+        return false;
+
+    int gemm_m = 0;
+    int gemm_n = 0;
+
+    std::tie(gemm_m, gemm_n, std::ignore) = CalculateGemmSize(ctx, 0);
+
+    is_applicable = is_applicable && gemm_m % 32 == 0 && gemm_n % 32 == 0;
+
+    for(int gemm_id = 0; gemm_id < CalculateNumberOfGemm(ctx); ++gemm_id)
+    {
+        int gemm_k = 0;
+
+        std::tie(std::ignore, std::ignore, gemm_k) = CalculateGemmSize(ctx, gemm_id);
+
+        is_applicable = is_applicable && gemm_k % 4 == 0;
+    }
+
+    return is_applicable;
+}
+
+PerformanceImplicitGemmBwdDataV4R1Xdlops
+ConvHipImplicitGemmBwdDataV4R1Xdlops::GetPerformanceConfig(const ConvolutionContext& ctx) const
+{
+    return GetPerformanceConfigBase<PerformanceImplicitGemmBwdDataV4R1Xdlops>(ctx);
+}
+
+bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsValidPerformanceConfig(
+    const ConvolutionContext& ctx, const PerformanceImplicitGemmBwdDataV4R1Xdlops& c) const
+{
+    MIOPEN_LOG_I("");
+    return c.IsValidValue() && c.IsValid(ctx);
+}
+PerformanceImplicitGemmBwdDataV4R1Xdlops
+ConvHipImplicitGemmBwdDataV4R1Xdlops::Search(const ConvolutionContext& ctx) const
+{
+
+    // \todo add fp16 and bfp16 kernels
+    return GenericSearchBwd(*this, ctx);
+
+    // fp16/bfp16 uses fp32 workspace to leverage fp32 atomic add
+    // if(ctx.IsFp16() || ctx.IsBfp16())
+    //     return GenericSearchBwd(*this, ctx, SearchTweak::WorkspaceInsteadOfXBuffer);
+    // else
+    //     return GenericSearchBwd(*this, ctx);
+}
+
+int ConvHipImplicitGemmBwdDataV4R1Xdlops::RunAndMeasureSolution(miopen::Handle& profile_h,
+                                                                ConstData_t bot_buf,
+                                                                Data_t top_buf,
+                                                                ConstData_t wei_buf,
+                                                                ConstData_t bias_buf,
+                                                                const ConvolutionContext&,
+                                                                const ConvSolution& solution,
+                                                                float& elapsed_time) const
+{
+    assert(bias_buf == nullptr);
+    (void)bias_buf;
+
+#ifdef NDEBUG
+    try
+#endif
+    {
+
+        elapsed_time = float(0);
+
+        for(auto& k_info : solution.construction_params)
+        {
+
+            auto kernel = profile_h.AddKernel("",
+                                              "",
+                                              k_info.kernel_file,
+                                              k_info.kernel_name,
+                                              k_info.l_wk,
+                                              k_info.g_wk,
+                                              k_info.comp_options);
+
+            kernel(bot_buf, wei_buf, top_buf);
+
+            elapsed_time += profile_h.GetKernelTime();
+        }
+    }
+
+#ifdef NDEBUG
+    catch(miopen::Exception& ex)
+    {
+        MIOPEN_LOG_WE(ex.what());
+        return -1;
+    }
+#endif
+    return 0;
+}
+
+ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution(
+    const ConvolutionContext& ctx,
+    const PerformanceImplicitGemmBwdDataV4R1Xdlops& config,
+    bool) const
+{
+    ConvSolution result;
+
+    assert(config.IsValid(ctx));
+
+    // a series of kernels
+    for(std::size_t gemm_id = 0; gemm_id < CalculateNumberOfGemm(ctx); ++gemm_id)
+    {
+        KernelInfo construction_parameters;
+
+        int gemm_m = 0;
+        int gemm_n = 0;
+        int gemm_k = 0;
+
+        std::tie(gemm_m, gemm_n, gemm_k) = CalculateGemmSize(ctx, gemm_id);
+
+        // don't compile or launch an empty gridwise GEMM
+        if(gemm_k > 0)
+        {
+            int grid_size = 0;
+
+            const std::size_t GemmMPerBlock = config.GemmMPerBlock;
+            const std::size_t GemmNPerBlock = config.GemmNPerBlock;
+            const std::size_t GemmKPerBlock = config.GemmKPerBlock;
+            const std::size_t GemmMPerWave  = config.GemmMPerWave;
+            const std::size_t GemmNPerWave  = config.GemmNPerWave;
+
+            const std::size_t block_size =
+                GemmNPerBlock * GemmMPerBlock / (GemmMPerWave * GemmNPerWave) * wave_size;
+
+            std::tie(grid_size, std::ignore) = config.CalculateGridSize(ctx);
+
+            construction_parameters.l_wk.push_back(block_size);
+            construction_parameters.l_wk.push_back(1);
+            construction_parameters.l_wk.push_back(1);
+
+            construction_parameters.g_wk.push_back(block_size * grid_size);
+            construction_parameters.g_wk.push_back(1);
+            construction_parameters.g_wk.push_back(1);
+
+            construction_parameters.kernel_file =
+                "gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp";
+
+            construction_parameters.kernel_name =
+                "gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw";
+
+            // TODO: add fp16 calculation by GetWorkspaceSize(ctx);
+            result.workspce_sz = 0;
+
+            std::size_t GemmABlockCopySrcDataPerRead_GemmM  = 1;
+            std::size_t GemmABlockCopyDstDataPerWrite_GemmM = 1;
+            std::size_t GemmBBlockCopySrcDataPerRead_GemmN  = 1;
+            std::size_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
+
+            std::tie(GemmABlockCopySrcDataPerRead_GemmM,
+                     GemmABlockCopyDstDataPerWrite_GemmM,
+                     std::ignore) = config.CalculateGemmABlockCopyPerformanceParameters(ctx);
+
+            std::tie(GemmBBlockCopySrcDataPerRead_GemmN,
+                     GemmBBlockCopyDstDataPerWrite_GemmN,
+                     std::ignore) = config.CalculateGemmBBlockCopyPerformanceParameters(ctx);
+
+            // clang-format off
+            construction_parameters.comp_options = 
+                std::string(" -std=c++14 ") +
+                std::string(" -DCK_PARAM_PROBLEM_N=") + std::to_string(ConvolutionContextInterpreter::GetBatchN(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_K=") + std::to_string(ConvolutionContextInterpreter::GetOutputChannelK(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_C=") + std::to_string(ConvolutionContextInterpreter::GetInputChannelC(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_HI=") + std::to_string(ConvolutionContextInterpreter::GetInputHeightHi(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_WI=") + std::to_string(ConvolutionContextInterpreter::GetInputWidthWi(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_HO=") + std::to_string(ConvolutionContextInterpreter::GetOutputHeightHo(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_WO=") + std::to_string(ConvolutionContextInterpreter::GetOutputWidthWo(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_Y=") + std::to_string(ConvolutionContextInterpreter::GetFilterHeightY(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_X=") + std::to_string(ConvolutionContextInterpreter::GetFilterWidthX(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_CONV_STRIDE_H=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedConvolutionStrideH(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_CONV_STRIDE_W=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedConvolutionStrideW(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_CONV_DILATION_H=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedConvolutionDilationH(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_CONV_DILATION_W=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedConvolutionDilationW(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_IN_LEFT_PAD_H=") + std::to_string(ConvolutionContextInterpreter::GetInputLeftPadH(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_IN_LEFT_PAD_W=") + std::to_string(ConvolutionContextInterpreter::GetInputLeftPadW(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_IN_RIGHT_PAD_H=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedInputRightPadH(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_IN_RIGHT_PAD_W=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedInputRightPadW(ctx)) +
+                std::string(" -DCK_PARAM_PROBLEM_CONV_GROUP_COUNTS=") + std::to_string(ctx.group_counts) +
+                std::string(" -DCK_PARAM_TUNABLE_BLOCK_SIZE=") + std::to_string(block_size) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_M_PER_BLOCK=") + std::to_string(GemmMPerBlock) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_N_PER_BLOCK=") + std::to_string(GemmNPerBlock) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_K_PER_BLOCK=") + std::to_string(GemmKPerBlock) +
+                std::string(" -DCK_PARAM_GEMM_M_PER_WAVE=") + std::to_string(GemmMPerWave) +
+                std::string(" -DCK_PARAM_GEMM_N_PER_WAVE=") + std::to_string(GemmNPerWave) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.WeiBlockCopyClusterLengths_GemmK) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M=") + std::to_string(config.WeiBlockCopyClusterLengths_GemmM) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_M=") + std::to_string(GemmABlockCopySrcDataPerRead_GemmM     ) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_M=") + std::to_string(GemmABlockCopyDstDataPerWrite_GemmM) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.InBlockCopyClusterLengths_GemmK) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N=") + std::to_string(config.InBlockCopyClusterLengths_GemmN) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N=") + std::to_string(GemmBBlockCopySrcDataPerRead_GemmN     ) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N=") + std::to_string(GemmBBlockCopyDstDataPerWrite_GemmN) +
+                std::string(" -DCK_PARAM_DEPENDENT_GRID_SIZE=") + std::to_string(grid_size) +
+                std::string(" -DCK_THREADWISE_GEMM_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') +
+                std::string(" -DCK_USE_AMD_BUFFER_ATOMIC_ADD=") + (support_amd_buffer_atomic_add(ctx) ? '1' : '0') +
+                std::string(" -DCK_USE_AMD_XDLOPS=") + std::to_string(IsXdlopsSupport(ctx) ? 1 : 0) +
+                std::string(" -DCK_USE_AMD_XDLOPS_INLINE_ASM=") + std::to_string(miopen::IsEnabled(MIOPEN_DEBUG_IMPLICIT_GEMM_XDLOPS_INLINE_ASM{}) ? 1 : 0) +
+                std::string(" -DCK_USE_AMD_XDLOPS_EMULATE=") + (miopen::IsEnabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_XDLOPS_EMULATE{}) ? '1' : '0') +
+                std::string(" -DCK_PARAM_GEMM_ID=") + std::to_string(gemm_id) +
+                std::string(" -D__HIP_PLATFORM_HCC__=1") +
+                ctx.general_compile_options;
+        
+            result.construction_params.push_back(construction_parameters);
+            
+        }
+    }
+    return result;
+}
+
+} // namespace solver
+} // namespace miopen

From 13e52273b030fd854ffac529126e78f0fdeff10f Mon Sep 17 00:00:00 2001
From: root <root@dc-smc-25.amd.com>
Date: Tue, 21 Apr 2020 12:49:51 +0000
Subject: [PATCH 02/18] add -abseil-string-find-startswith in CMakeList

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b2533e0571..16a517745a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -298,6 +298,7 @@ include(ClangTidy)
 enable_clang_tidy(
     CHECKS
         *
+        -abseil-string-find-startswith
         -android-cloexec-fopen
         # Yea we shouldn't be using rand()
         -cert-msc30-c

From fe5a8b21ce36a16110dd63b8372fbd9cdadeb139 Mon Sep 17 00:00:00 2001
From: root <root@dc-smc-25.amd.com>
Date: Wed, 22 Apr 2020 09:25:17 +0800
Subject: [PATCH 03/18] modify caller of ComputeLDSRequiredSize

---
 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
index 2f8858f71a..dcdbef7e89 100755
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -215,7 +215,7 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext&
                                                  1,
                                                  GemmBBlockCopyThreadSliceLengths_GemmN,
                                                  GemmABlockCopyThreadSliceLengths_GemmM,
-                                                 true);
+                                                 1);
     return lds_size <= 64 * 1024;
 }
 

From 3205ab7ec2c3abb62059cebd69163e8b05e990d2 Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Wed, 6 May 2020 09:41:00 +0800
Subject: [PATCH 04/18] merge develop branch

---
 src/conv/invokers/impl_gemm.cpp               | 205 ++++++++++++++++++
 .../miopen/conv/invokers/impl_gemm.hpp        |  41 ++++
 2 files changed, 246 insertions(+)
 create mode 100644 src/conv/invokers/impl_gemm.cpp
 create mode 100644 src/include/miopen/conv/invokers/impl_gemm.hpp

diff --git a/src/conv/invokers/impl_gemm.cpp b/src/conv/invokers/impl_gemm.cpp
new file mode 100644
index 0000000000..4b2ddc8ddc
--- /dev/null
+++ b/src/conv/invokers/impl_gemm.cpp
@@ -0,0 +1,205 @@
+#include <miopen/conv/invokers/impl_gemm.hpp>
+
+#include <miopen/conv/data_invoke_params.hpp>
+#include <miopen/algorithm.hpp>
+#include <miopen/handle.hpp>
+#include <miopen/tensor_ops.hpp>
+
+#include <boost/any.hpp>
+
+namespace miopen {
+namespace conv {
+
+InvokerFactory MakeImplGemmDataInvokerFactory(const ConvolutionContext& ctx)
+{
+    if(ctx.direction.IsForward())
+    {
+        return [](const std::vector<Kernel>& kernels) {
+            return [=](Handle& handle, const boost::any& primitive_parameters) {
+                const auto data_ctx = boost::any_cast<conv::DataInvokeParams>(primitive_parameters);
+                const auto& tensors = data_ctx.tensors;
+                handle.Run(kernels[0])(tensors.in, tensors.w, tensors.out);
+            };
+        };
+    }
+    else
+    {
+        const auto& conv       = ctx.conv_problem.GetConv();
+        const auto& lowp_quant = conv.lowp_quant;
+
+        return [conv, lowp_quant](const std::vector<Kernel>& kernels) {
+            return [=](Handle& handle, const boost::any& primitive_parameters) {
+                const auto data_ctx = boost::any_cast<conv::DataInvokeParams>(primitive_parameters);
+                const auto& tensors = data_ctx.tensors;
+                const auto& workSpace = data_ctx.workSpace;
+
+                // Miminum checks. Only check what is required to select
+                // proper invocation procedure & workspace sanity.
+                auto kernel = handle.Run(kernels[0]);
+
+                float elapsed = 0;
+                // clang-format off
+                if((tensors.outDesc.GetType() == miopenHalf ||
+                    tensors.outDesc.GetType() == miopenBFloat16) &&
+                   (kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw" ||
+                    kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_gnchw_gkcyx_gnkhw" ||
+                    kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw" ||
+                    kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw"))
+                // clang-format on
+                {
+                    float zero = 0.f;
+                    TensorDescriptor workspaceDesc(
+                        miopenFloat, tensors.outDesc.GetLengths(), tensors.outDesc.GetStrides());
+                    SetTensor(handle, workspaceDesc, workSpace, &zero);
+                    if(handle.IsProfilingEnabled())
+                        elapsed += handle.GetKernelTime();
+
+                    kernel(tensors.in, tensors.w, workSpace);
+                    if(handle.IsProfilingEnabled())
+                        elapsed += handle.GetKernelTime();
+
+                    CastTensor(handle,
+                               &lowp_quant,
+                               workspaceDesc,
+                               workSpace,
+                               tensors.outDesc,
+                               tensors.out,
+                               0,
+                               0);
+                    if(handle.IsProfilingEnabled())
+                        elapsed += handle.GetKernelTime();
+                }
+                // clang-format off
+                else if((kernel.GetName() == "gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer") ||
+                        (kernel.GetName() == "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer"))
+                // clang-format on
+                {
+                    bool hasStride =
+                        (tensors.inDesc.GetLengths()[2] != tensors.outDesc.GetLengths()[2]) ||
+                        (tensors.inDesc.GetLengths()[3] != tensors.outDesc.GetLengths()[3]);
+                    /// \todo set zero within implicitGEMM kernel
+                    if(hasStride)
+                    {
+                        MIOPEN_LOG_I2("hasStride, call SetTensor with zero");
+                        float zero = 0.f;
+                        SetTensor(handle, tensors.outDesc, tensors.out, &zero);
+
+                        if(handle.IsProfilingEnabled())
+                            elapsed += handle.GetKernelTime();
+                    }
+
+                    kernel(tensors.in, tensors.w, tensors.out);
+
+                    if(handle.IsProfilingEnabled())
+                        elapsed += handle.GetKernelTime();
+                }
+                // clang-format off
+                else if(kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw" ||
+                        kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_gnchw_gkcyx_gnkhw")
+                // clang-format on
+                {
+                    float zero = 0.f;
+                    SetTensor(handle, tensors.outDesc, tensors.out, &zero);
+                    if(handle.IsProfilingEnabled())
+                        elapsed += handle.GetKernelTime();
+
+                    kernel(tensors.in, tensors.w, tensors.out);
+                    if(handle.IsProfilingEnabled())
+                        elapsed += handle.GetKernelTime();
+                }
+                // clang-format off
+                else if(
+                    kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw" ||
+                    kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw")
+                // clang-format on
+                {
+                    // this kernel accumulate results into input tensor, therefore need to set zero
+                    bool is_1x1_s1 = false;
+                    if(miopen::all_of(conv.GetConvPads(), [](auto v) { return v == 0; }) &&
+                       miopen::all_of(conv.GetConvStrides(), [](auto v) { return v == 1; }))
+                    {
+                        if(tensors.wDesc.GetLengths()[2] == 1 && tensors.wDesc.GetLengths()[3] == 1)
+                        { // filter = 1
+                            if(tensors.wDesc.GetSize() == 4 ||
+                               (tensors.wDesc.GetSize() == 5 && tensors.wDesc.GetLengths()[4] == 1))
+                            {
+                                is_1x1_s1 = true;
+                            }
+                        }
+                    }
+
+                    if(!is_1x1_s1)
+                    {
+                        float zero = 0.f;
+                        SetTensor(handle, tensors.outDesc, tensors.out, &zero);
+                        if(handle.IsProfilingEnabled())
+                            elapsed += handle.GetKernelTime();
+                    }
+
+                    kernel(tensors.in, tensors.w, tensors.out);
+                    if(handle.IsProfilingEnabled())
+                        elapsed += handle.GetKernelTime();
+                }
+                // clang-format off
+                else if(
+                    kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw" ||
+                    kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v4r1_ncdhw_kczyx_nkdhw")
+                // clang-format on
+                {
+                    // \todo this kernel doesn't always need to set-zero
+                    bool filterGeStride = false;
+                    if(miopen::all_of(conv.GetConvPads(), [](auto v) { return v == 0; }))
+                    {
+                        if(tensors.wDesc.GetSize() == 4)
+                        { // 2d
+                            if(tensors.wDesc.GetLengths()[2] >= conv.GetConvStrides()[0] &&
+                               tensors.wDesc.GetLengths()[3] >= conv.GetConvStrides()[1])
+                            {
+                                filterGeStride = true;
+                            }
+                        }
+                        else
+                        { // 3d
+                            if(tensors.wDesc.GetLengths()[2] >= conv.GetConvStrides()[0] &&
+                               tensors.wDesc.GetLengths()[3] >= conv.GetConvStrides()[1] &&
+                               tensors.wDesc.GetLengths()[4] >= conv.GetConvStrides()[2])
+                            {
+                                filterGeStride = true;
+                            }
+                        }
+                    }
+
+                    if(!filterGeStride)
+                    {
+                        float zero = 0.f;
+                        SetTensor(handle, tensors.outDesc, tensors.out, &zero);
+
+                        if(handle.IsProfilingEnabled())
+                            elapsed += handle.GetKernelTime();
+                    }
+
+                    // a group kernels (compiled from same source code) will be launched
+                    for(const auto& k : kernels)
+                    {
+                        handle.Run(k)(tensors.in, tensors.w, tensors.out);
+                        elapsed += handle.GetKernelTime();
+                    }
+                }
+                else
+                {
+                    MIOPEN_THROW(
+                        "Error running implicit GEMM backward data convolution (none workspace?)");
+                }
+
+                if(handle.IsProfilingEnabled())
+                {
+                    handle.ResetKernelTime();
+                    handle.AccumKernelTime(elapsed);
+                }
+            };
+        };
+    }
+}
+
+} // namespace conv
+} // namespace miopen
diff --git a/src/include/miopen/conv/invokers/impl_gemm.hpp b/src/include/miopen/conv/invokers/impl_gemm.hpp
new file mode 100644
index 0000000000..5b6dd1aa16
--- /dev/null
+++ b/src/include/miopen/conv/invokers/impl_gemm.hpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+*
+* MIT License
+*
+* Copyright (c) 2019 Advanced Micro Devices, Inc.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in all
+* copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+*******************************************************************************/
+
+#pragma once
+
+#include <miopen/invoker.hpp>
+#include <miopen/kernel.hpp>
+#include <miopen/conv/context.hpp>
+
+#include <vector>
+
+namespace miopen {
+namespace conv {
+
+InvokerFactory MakeImplGemmDataInvokerFactory(const ConvolutionContext& ctx);
+
+} // namespace conv
+} // namespace miopen

From c2abbb7669e1fb6c700c637639aedfb825d11e3b Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Wed, 6 May 2020 09:42:01 +0800
Subject: [PATCH 05/18] merge develop branch 1

---
 Jenkinsfile                                   |   4 +-
 src/binary_cache.cpp                          |   6 +-
 src/find_db.cpp                               |   4 +-
 .../miopen/conv/problem_description.hpp       |  10 +
 src/include/miopen/execution_context.hpp      |  34 ++-
 src/include/miopen/kern_db.hpp                |  61 ++---
 src/include/miopen/problem_description.hpp    |   3 +
 src/include/miopen/sqlite_db.hpp              | 259 ++++++------------
 src/kern_db.cpp                               |  33 +--
 ...r1_xdlops_fp16_bfp16_gnchw_gkcyx_gnkhw.hpp |   6 +-
 ..._v1r1_xdlops_fp16_bfp16_nchw_kcyx_nkhw.hpp |   6 +-
 ...bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp |   8 +-
 ...bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp |  12 +-
 ...bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp |   8 +-
 .../gridwise_gemm_fp16_bfp16.hpp              |   6 +-
 .../gridwise_gemm_xdlops_fp16_bfp16.hpp       |  12 +-
 .../include/tensor_operation/xdlops_gemm.hpp  |  36 ++-
 .../include/utility/float_type.hpp            |  45 +--
 ...ps_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp |  14 +-
 ...dlops_nchw_kcyx_nkhw_lds_double_buffer.cpp |  14 +-
 src/kernels/float_types.h                     |   2 +-
 src/ocl/gcn_asm_utils.cpp                     |  28 +-
 src/ocl/utilocl.cpp                           | 104 +++----
 .../conv_hip_implicit_gemm_bwd_data_v1r1.cpp  |  10 +-
 ...hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp |  10 +-
 .../conv_hip_implicit_gemm_bwd_data_v4r1.cpp  |  16 +-
 src/solver/conv_hip_implicit_gemm_v4.cpp      |  17 +-
 src/solver/conv_hip_implicit_gemm_v4r1.cpp    |  18 +-
 src/solver/conv_hip_implicit_gemm_v4r4.cpp    |  10 +-
 ...conv_hip_implicit_gemm_v4r4_gen_xdlops.cpp |  14 +-
 ...implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp |  13 +-
 .../conv_hip_implicit_gemm_v4r4_xdlops.cpp    |  28 +-
 src/solver/conv_ocl_dir2D_bwdWrW_2.cpp        |   2 +
 src/sqlite_db.cpp                             | 218 ++++++++++++++-
 test/find_db.cpp                              |   1 +
 test/sqlite_perfdb.cpp                        |  66 ++---
 36 files changed, 655 insertions(+), 483 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 27379571cf..c0851e48bb 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -250,7 +250,7 @@ pipeline {
                     }
                 }
 
-                stage('Hip clang release') {
+                stage('Hip clang debug') {
                     agent{ label rocmnode("vega") }
                     environment{
                         cmd = """
@@ -258,7 +258,7 @@ pipeline {
                             rm -rf build
                             mkdir build
                             cd build
-                            CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. 
+                            CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=debug -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. 
                             CTEST_PARALLEL_LEVEL=4 MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM=0 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check
                         """
 
diff --git a/src/binary_cache.cpp b/src/binary_cache.cpp
index 9fef73ae0b..5e48baa449 100644
--- a/src/binary_cache.cpp
+++ b/src/binary_cache.cpp
@@ -60,9 +60,9 @@ boost::filesystem::path ComputeUserCachePath()
 #ifdef MIOPEN_CACHE_DIR
     std::string cache_dir = MIOPEN_CACHE_DIR;
 
-    std::string version = std::to_string(MIOPEN_VERSION_MAJOR) + "." +
-                          std::to_string(MIOPEN_VERSION_MINOR) + "." +
-                          std::to_string(MIOPEN_VERSION_PATCH);
+    std::string version =
+        std::to_string(MIOPEN_VERSION_MAJOR) + "." + std::to_string(MIOPEN_VERSION_MINOR) + "." +
+        std::to_string(MIOPEN_VERSION_PATCH) + "." + MIOPEN_STRINGIZE(MIOPEN_VERSION_TWEAK);
 
     auto p = boost::filesystem::path{miopen::ExpandUser(cache_dir)} / version;
     if(!boost::filesystem::exists(p))
diff --git a/src/find_db.cpp b/src/find_db.cpp
index 68664ed0ec..936ad8f06e 100644
--- a/src/find_db.cpp
+++ b/src/find_db.cpp
@@ -63,7 +63,9 @@ bool CheckInvokerSupport(const std::string& algo)
            algo == "miopenConvolutionBwdDataAlgoDirect" ||
            algo == "miopenConvolutionBwdWeightsAlgoDirect" ||
            algo == "miopenConvolutionFwdAlgoWinograd" ||
-           algo == "miopenConvolutionBwdDataAlgoWinograd";
+           algo == "miopenConvolutionBwdDataAlgoWinograd" ||
+           algo == "miopenConvolutionFwdAlgoImplicitGEMM" ||
+           algo == "miopenConvolutionBwdDataAlgoImplicitGEMM";
 }
 
 template <class TDb>
diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp
index 33c61cd206..2cbe15e5d2 100644
--- a/src/include/miopen/conv/problem_description.hpp
+++ b/src/include/miopen/conv/problem_description.hpp
@@ -224,6 +224,7 @@ struct ProblemDescription
     const TensorDescriptor& GetIn() const { return in; }
     const TensorDescriptor& GetWeights() const { return weights; }
     const TensorDescriptor& GetOut() const { return out; }
+    const ConvolutionDescriptor& GetConv() const { return conv; }
     Direction GetDirection() const { return direction; }
     int GetBias() const { return bias; }
 
@@ -235,6 +236,15 @@ struct ProblemDescription
     std::size_t GetBackwardPadW() const { return GetWeightsWidth() - GetPadW() - 1; }
     std::size_t GetBackwardPadH() const { return GetWeightsHeight() - GetPadW() - 1; }
 
+    bool IsAsymmetricPadH() const
+    {
+        return conv.paddingMode == miopenPaddingSame && (GetWeightsHeight() % 2) == 0;
+    }
+    bool IsAsymmetricPadW() const
+    {
+        return conv.paddingMode == miopenPaddingSame && (GetWeightsWidth() % 2) == 0;
+    }
+
     bool Is2d() const { return GetSpatialDims() == 2; }
 
     bool IsFp32() const
diff --git a/src/include/miopen/execution_context.hpp b/src/include/miopen/execution_context.hpp
index b695f5b300..0e7374fc76 100644
--- a/src/include/miopen/execution_context.hpp
+++ b/src/include/miopen/execution_context.hpp
@@ -28,6 +28,7 @@
 
 #include <miopen/db_path.hpp>
 #include <miopen/handle.hpp>
+#include <miopen/sqlite_db.hpp>
 
 #include <string>
 
@@ -83,32 +84,39 @@ struct ExecutionContext
 
     std::string GetPerfDbPath() const
     {
-        // clang-format off
-        return GetSystemDbPath()
+        boost::filesystem::path pdb_path(GetSystemDbPath());
+        std::ostringstream filename;
+// clang-format off
 #if MIOPEN_ENABLE_SQLITE
-            + "/miopen.db";
+            filename << "miopen.db";
 #else
-            + "/"
-            + GetStream().GetDbBasename()
-            + ".cd.pdb.txt";
+            filename << GetStream().GetDbBasename()
+            << ".cd.pdb.txt";
 #endif
         // clang-format on
+        return (pdb_path / filename.str()).string();
     }
 
     std::string GetUserPerfDbPath() const
     {
+        // an empty user-db path indicates user intent to disable
+        // the database. Default in when dev builds are on
         // clang-format off
-        return GetUserDbPath()
+	const auto& udb = GetUserDbPath();
+	if(udb.empty())
+		return "";
+        boost::filesystem::path pdb_path(udb);
+        std::ostringstream filename;
 #if MIOPEN_ENABLE_SQLITE
-             + "/miopen.udb";
+             filename << "miopen_" << SQLitePerfDb::MIOPEN_PERFDB_SCHEMA_VER << ".udb";
 #else
-             + "/"
-             + GetStream().GetDbBasename()
-             + "."
-             + GetUserDbSuffix()
-             + ".cd.updb.txt";
+             filename << GetStream().GetDbBasename()
+             << "."
+             << GetUserDbSuffix()
+             << ".cd.updb.txt";
 #endif
         // clang-format on
+        return (pdb_path / filename.str()).string();
     }
 
     private:
diff --git a/src/include/miopen/kern_db.hpp b/src/include/miopen/kern_db.hpp
index a471ae8aef..d176c59aa2 100644
--- a/src/include/miopen/kern_db.hpp
+++ b/src/include/miopen/kern_db.hpp
@@ -105,13 +105,13 @@ class KernDb : public SQLiteBase<KernDb>
             return true;
         auto del_query =
             "DELETE FROM " + T::table_name() + " WHERE " + problem_config.Where() + ";";
-        sqlite3_stmt_ptr pStmt = Prepare(del_query);
-        auto rc                = SQLRety([&]() { return sqlite3_step(pStmt.get()); });
+        auto stmt = SQLite::Statement{sql, del_query};
+        auto rc   = stmt.Step(sql);
         if(rc == SQLITE_DONE)
             return true;
         else
         {
-            MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
+            MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage());
             return false;
         }
     }
@@ -124,18 +124,15 @@ class KernDb : public SQLiteBase<KernDb>
         // Where clause with inserted values defeats the purpose of a prepraed statement
         auto select_query = "SELECT kernel_blob, kernel_hash, uncompressed_size FROM " +
                             T::table_name() + " WHERE " + problem_config.Where() + ";";
-        sqlite3_stmt_ptr pStmt = Prepare(select_query);
+        auto stmt = SQLite::Statement{sql, select_query};
         // only one result field
         // assert one row
-        auto rc = SQLRety([&]() { return sqlite3_step(pStmt.get()); });
+        auto rc = stmt.Step(sql);
         if(rc == SQLITE_ROW)
         {
-            auto ptr = sqlite3_column_blob(pStmt.get(), 0);
-            auto sz  = sqlite3_column_bytes(pStmt.get(), 0);
-            std::string compressed_blob(reinterpret_cast<const char*>(ptr), sz);
-            std::string md5_hash(reinterpret_cast<const char*>(sqlite3_column_text(pStmt.get(), 1)),
-                                 sqlite3_column_bytes(pStmt.get(), 1));
-            auto uncompressed_size         = sqlite3_column_int64(pStmt.get(), 2);
+            auto compressed_blob           = stmt.ColumnBlob(0);
+            auto md5_hash                  = stmt.ColumnText(1);
+            auto uncompressed_size         = stmt.ColumnInt64(2);
             std::string& decompressed_blob = compressed_blob;
             if(uncompressed_size != 0)
             {
@@ -149,7 +146,7 @@ class KernDb : public SQLiteBase<KernDb>
         else if(rc == SQLITE_DONE)
             return boost::none;
         else
-            MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
+            MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage());
         return boost::none;
     }
 
@@ -165,44 +162,24 @@ class KernDb : public SQLiteBase<KernDb>
         auto uncompressed_size = problem_config.kernel_blob.size();
         bool success           = false;
         auto compressed_blob   = compress_fn(problem_config.kernel_blob, &success);
-        sqlite3_stmt_ptr pStmt = Prepare(insert_query);
-        sqlite3_bind_text(pStmt.get(),
-                          1,
-                          problem_config.kernel_name.data(),
-                          problem_config.kernel_name.size(),
-                          SQLITE_TRANSIENT); // NOLINT
-        sqlite3_bind_text(pStmt.get(),
-                          2,
-                          problem_config.kernel_args.data(),
-                          problem_config.kernel_args.size(),
-                          SQLITE_TRANSIENT); // NOLINT
+        auto stmt              = SQLite::Statement{sql, insert_query};
+        stmt.BindText(1, problem_config.kernel_name);
+        stmt.BindText(2, problem_config.kernel_args);
         if(!success)
         {
-            sqlite3_bind_blob(pStmt.get(),
-                              3,
-                              problem_config.kernel_blob.data(),
-                              problem_config.kernel_blob.size(),
-                              SQLITE_TRANSIENT); // NOLINT
-            sqlite3_bind_int64(pStmt.get(), 5, 0);
+            stmt.BindBlob(3, problem_config.kernel_blob);
+            stmt.BindInt64(5, 0);
         }
         else
         {
-            sqlite3_bind_blob(pStmt.get(),
-                              3,
-                              compressed_blob.data(),
-                              compressed_blob.size(),
-                              SQLITE_TRANSIENT); // NOLINT
-            sqlite3_bind_int64(pStmt.get(), 5, uncompressed_size);
+            stmt.BindBlob(3, compressed_blob);
+            stmt.BindInt64(5, uncompressed_size);
         }
-        sqlite3_bind_text(pStmt.get(),
-                          4,
-                          md5_sum.data(),
-                          md5_sum.size(),
-                          SQLITE_TRANSIENT); // NOLINT
+        stmt.BindText(4, md5_sum);
 
-        auto rc = SQLRety([&]() { return sqlite3_step(pStmt.get()); });
+        auto rc = stmt.Step(sql);
         if(rc != SQLITE_DONE)
-            MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
+            MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage());
         return problem_config.kernel_blob;
     }
 };
diff --git a/src/include/miopen/problem_description.hpp b/src/include/miopen/problem_description.hpp
index ea51887176..f6bf8d9766 100644
--- a/src/include/miopen/problem_description.hpp
+++ b/src/include/miopen/problem_description.hpp
@@ -174,6 +174,9 @@ struct ProblemDescription
     int GetBackwardPadW() const { return kernel_size_w - pad_w - 1; }
     int GetBackwardPadH() const { return kernel_size_h - pad_h - 1; }
 
+    bool IsAsymmetricPadH() const { return conv_problem.IsAsymmetricPadH(); }
+    bool IsAsymmetricPadW() const { return conv_problem.IsAsymmetricPadW(); }
+
     bool Is2d() const { return spatial_dims == 2; }
     bool Is3d() const { return spatial_dims == 3; }
 
diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp
index 107435e986..3c80254087 100644
--- a/src/include/miopen/sqlite_db.hpp
+++ b/src/include/miopen/sqlite_db.hpp
@@ -155,22 +155,58 @@ struct SQLiteSerializable
     }
 };
 
+class SQLite
+{
+    class impl;
+    // do we need propagate const
+    std::unique_ptr<impl> pImpl;
+
+    public:
+    class Statement
+    {
+        class impl;
+        std::unique_ptr<impl> pImpl;
+
+        public:
+        Statement(const SQLite& sql, const std::string& query);
+        Statement(const SQLite& sql,
+                  const std::string& query,
+                  const std::vector<std::string>& vals);
+        Statement();
+        ~Statement();
+        Statement(Statement&&) noexcept;
+        Statement& operator=(Statement&&) noexcept;
+        Statement& operator=(const Statement&) = delete;
+        int Step(const SQLite& sql);
+        std::string ColumnText(int idx);
+        std::string ColumnBlob(int idx);
+        int64_t ColumnInt64(int idx);
+        int BindText(int idx, const std::string& txt);
+        int BindBlob(int idx, const std::string& blob);
+        int BindInt64(int idx, int64_t);
+    };
+
+    using result_type = std::vector<std::unordered_map<std::string, std::string>>;
+    SQLite();
+    SQLite(const std::string& filename_, bool is_system);
+    ~SQLite();
+    SQLite(SQLite&&) noexcept;
+    SQLite& operator=(SQLite&&) noexcept;
+    SQLite& operator=(const SQLite&) = delete;
+    bool Valid() const;
+    result_type Exec(const std::string& query) const;
+    int Changes() const;
+    int Retry(std::function<int()>) const;
+    static int Retry(std::function<int()> f, std::string filename);
+    std::string ErrorMessage() const;
+};
+
 template <typename Derived>
 class SQLiteBase
 {
     protected:
-    struct SQLiteCloser
-    {
-        void operator()(sqlite3* ptr)
-        {
-            std::string filename_(sqlite3_db_filename(ptr, "main"));
-            SQLiteBase::SQLRety([&]() { return sqlite3_close(ptr); }, filename_);
-        }
-    };
-    using sqlite3_ptr      = std::unique_ptr<sqlite3, SQLiteCloser>;
-    using exclusive_lock   = boost::unique_lock<LockFile>;
-    using shared_lock      = boost::shared_lock<LockFile>;
-    using sqlite3_stmt_ptr = MIOPEN_MANAGE_PTR(sqlite3_stmt*, sqlite3_finalize);
+    using exclusive_lock = boost::unique_lock<LockFile>;
+    using shared_lock    = boost::shared_lock<LockFile>;
     static boost::system_time GetLockTimeout()
     {
         return boost::get_system_time() + boost::posix_time::milliseconds(60000);
@@ -199,6 +235,11 @@ class SQLiteBase
         {
             auto file            = boost::filesystem::path(filename_);
             const auto directory = file.remove_filename();
+            if(directory.string().empty())
+            {
+                dbInvalid = true;
+                return;
+            }
 
             if(!(boost::filesystem::exists(directory)))
             {
@@ -208,26 +249,15 @@ class SQLiteBase
                     boost::filesystem::permissions(directory, boost::filesystem::all_all);
             }
         }
-        sqlite3* ptr_tmp;
-        int rc = 0;
-        if(is_system)
-            rc = sqlite3_open_v2(filename_.c_str(), &ptr_tmp, SQLITE_OPEN_READONLY, nullptr);
-        else
-            rc = sqlite3_open_v2(
-                filename_.c_str(), &ptr_tmp, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, nullptr);
-        ptrDb = sqlite3_ptr{ptr_tmp};
-        if(rc != 0)
+        sql = std::move(SQLite{filename_, is_system});
+        if(!sql.Valid())
         {
             dbInvalid = true;
             if(!is_system)
-            {
                 MIOPEN_THROW(miopenStatusInternalError, "Cannot open database file:" + filename_);
-            }
             else
-            {
                 MIOPEN_LOG_W("Unable to read system database file:" + filename_ +
                              " Performance may degrade");
-            }
         }
         else
             dbInvalid = false;
@@ -236,27 +266,16 @@ class SQLiteBase
     static Derived&
     GetCached(const std::string& path, bool is_system, const std::string& arch, std::size_t num_cu);
     // TODO: Fix this for the overhead of having fields per record
-    using SQLRes_t = std::vector<std::unordered_map<std::string, std::string>>;
-
-    static int find_callback(void* _res, int argc, char** argv, char** azColName)
-    {
-        SQLRes_t* res = static_cast<SQLRes_t*>(_res);
-        std::unordered_map<std::string, std::string> record;
-        for(auto i               = 0; i < argc; i++)
-            record[azColName[i]] = (argv[i] != nullptr) ? argv[i] : "NULL";
-        res->push_back(record);
-        return 0;
-    }
 
     inline auto CheckTableColumns(const std::string& tableName,
                                   const std::vector<std::string>& goldenList) const
     {
         const auto sql_cfg_fds = "PRAGMA table_info(" + tableName + ");";
-        SQLRes_t cfg_res;
+        SQLite::result_type cfg_res;
         {
             const auto lock = shared_lock(lock_file, GetLockTimeout());
             MIOPEN_VALIDATE_LOCK(lock);
-            SQLExec(sql_cfg_fds, cfg_res);
+            cfg_res = sql.Exec(sql_cfg_fds);
         }
         std::vector<std::string> cfg_fds(cfg_res.size());
         std::transform(
@@ -277,107 +296,6 @@ class SQLiteBase
         return AllFound;
     }
 
-    inline auto SQLExec(const std::string& query)
-    {
-        MIOPEN_LOG_T(std::this_thread::get_id() << ":" << query);
-        {
-            auto rc = SQLRety([&]() {
-                return sqlite3_exec(ptrDb.get(), query.c_str(), find_callback, nullptr, nullptr);
-            });
-            if(rc != SQLITE_OK)
-            {
-                MIOPEN_LOG_I2(query);
-                MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
-                sqlite3_close(ptrDb.get());
-                return false;
-            }
-        }
-        return true;
-    }
-    inline auto SQLExec(const std::string& query, SQLRes_t& res) const
-    {
-        res.clear();
-        MIOPEN_LOG_T(std::this_thread::get_id() << ":" << query);
-        {
-            auto rc = SQLRety([&]() {
-                return sqlite3_exec(
-                    ptrDb.get(), query.c_str(), find_callback, static_cast<void*>(&res), nullptr);
-            });
-            if(rc != SQLITE_OK)
-            {
-                MIOPEN_LOG_I2(query);
-                MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
-                sqlite3_close(ptrDb.get());
-                return false;
-            }
-        }
-        return true;
-    }
-
-    template <class F>
-    inline int SQLRety(F f) const
-    {
-        return SQLiteBase::SQLRety(f, filename);
-    }
-
-    template <class F>
-    static inline int SQLRety(F f, std::string filename)
-    {
-        auto timeout_end = std::chrono::high_resolution_clock::now() +
-                           std::chrono::seconds(30); // TODO: make configurable
-        auto tries = 0;
-        while(true)
-        {
-            int rc = f();
-            if(rc == SQLITE_BUSY)
-            {
-                MIOPEN_LOG_I2("Database" + filename + "  busy, retrying ...");
-                ++tries;
-                if(tries > 50)
-                    std::this_thread::sleep_for(std::chrono::microseconds(100));
-                else
-                    std::this_thread::yield();
-            }
-            else
-                return rc;
-            if(std::chrono::high_resolution_clock::now() > timeout_end)
-                MIOPEN_THROW("Timeout while waiting for Database: " + filename);
-        }
-    }
-
-    inline std::string SQLErrorMessage() const
-    {
-        std::string errMsg = "Internal error while accessing SQLite database: ";
-        return errMsg + sqlite3_errmsg(ptrDb.get());
-    }
-
-    auto Prepare(const std::string& query) const
-    {
-        sqlite3_stmt* ptr = nullptr;
-        MIOPEN_LOG_I2(query);
-        auto rc = sqlite3_prepare_v2(ptrDb.get(), query.c_str(), query.size(), &ptr, nullptr);
-        if(rc != SQLITE_OK)
-        {
-            std::string err_msg = "SQLite prepare error: ";
-            MIOPEN_THROW(miopenStatusInternalError, err_msg + sqlite3_errmsg(ptrDb.get()));
-        }
-        return sqlite3_stmt_ptr{ptr};
-    }
-    auto PrepareAndBind(const std::string& query, std::vector<std::string>& values) const
-    {
-        auto stmt = Prepare(query);
-        int cnt   = 1;
-        for(auto& kinder : values)
-        {
-            auto rc = sqlite3_bind_text(
-                stmt.get(), cnt++, kinder.data(), kinder.size(), SQLITE_TRANSIENT); // NOLINT
-            if(rc != SQLITE_OK)
-                MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
-        }
-        MIOPEN_LOG_I2("[" << JoinStrings(values, ",") << "]");
-        return stmt;
-    }
-
     template <typename... U>
     inline auto FindRecord(U&... args)
     {
@@ -426,13 +344,12 @@ class SQLiteBase
         return reinterpret_cast<Derived*>(this)->LoadUnsafe(args...);
     }
 
-    protected:
     std::string filename;
     std::string arch;
     size_t num_cu;
     LockFile& lock_file;
-    sqlite3_ptr ptrDb = nullptr;
     bool dbInvalid;
+    SQLite sql;
 };
 
 template <typename Derived>
@@ -457,6 +374,7 @@ Derived& SQLiteBase<Derived>::GetCached(const std::string& path,
 class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
 {
     public:
+    static constexpr char const* MIOPEN_PERFDB_SCHEMA_VER = "1.0.0";
     SQLitePerfDb(const std::string& filename_,
                  bool is_system,
                  const std::string& arch_,
@@ -468,12 +386,12 @@ class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
         std::string clause;
         std::vector<std::string> vals;
         std::tie(clause, vals) = prob_desc.InsertQuery();
-        auto stmt = PrepareAndBind(clause, vals);
-        auto rc   = SQLRety([&]() { return sqlite3_step(stmt.get()); });
+        auto stmt = SQLite::Statement{sql, clause, vals};
+        auto rc   = stmt.Step(sql);
         if(rc != SQLITE_DONE)
             MIOPEN_THROW(miopenStatusInternalError,
-                         "Failed to insert config: " + SQLErrorMessage());
-        auto cnt = sqlite3_changes(ptrDb.get());
+                         "Failed to insert config: " + sql.ErrorMessage());
+        auto cnt = sql.Changes();
         MIOPEN_LOG_I2(cnt << " rows updated");
     }
     template <class T>
@@ -483,20 +401,16 @@ class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
         std::vector<std::string> vals;
         std::tie(clause, vals) = prob_desc.WhereClause();
         auto query = "SELECT id FROM " + prob_desc.table_name() + " WHERE ( " + clause + " );";
-        auto stmt  = PrepareAndBind(query, vals);
+        auto stmt  = SQLite::Statement{sql, query, vals};
         while(true)
         {
-            auto rc = SQLRety([&]() { return sqlite3_step(stmt.get()); });
+            auto rc = stmt.Step(sql);
             if(rc == SQLITE_ROW)
-            {
-                auto id =
-                    std::string(reinterpret_cast<const char*>(sqlite3_column_text(stmt.get(), 0)));
-                return id;
-            }
+                return stmt.ColumnText(0);
             else if(rc == SQLITE_DONE)
                 return "";
             else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE)
-                MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
+                MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage());
         }
     }
     template <typename T>
@@ -518,22 +432,17 @@ class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
             "AND (arch = '" + arch + "' ) "
             "AND (num_cu = '" + std::to_string(num_cu) + "');";
         // clang-format on
-        auto stmt = PrepareAndBind(select_query, values);
+        auto stmt = SQLite::Statement{sql, select_query, values};
         DbRecord rec;
         while(true)
         {
-            auto rc = SQLRety([&]() { return sqlite3_step(stmt.get()); });
+            auto rc = stmt.Step(sql);
             if(rc == SQLITE_ROW)
-            {
-                auto c_slvr   = sqlite3_column_text(stmt.get(), 0);
-                auto c_params = sqlite3_column_text(stmt.get(), 1);
-                rec.SetValues(std::string(reinterpret_cast<const char*>(c_slvr)),
-                              std::string(reinterpret_cast<const char*>(c_params)));
-            }
+                rec.SetValues(stmt.ColumnText(0), stmt.ColumnText(1));
             else if(rc == SQLITE_DONE)
                 break;
             else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE)
-                MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
+                MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage());
         }
         if(rec.GetSize() == 0)
             return boost::none;
@@ -561,14 +470,14 @@ class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
             + clause + " ) )"
             "AND solver == '" +  id + "' ;";
         // clang-format on
-        auto stmt = PrepareAndBind(query, values);
-        auto rc   = SQLRety([&]() { return sqlite3_step(stmt.get()); });
+        auto stmt = SQLite::Statement{sql, query, values};
+        auto rc   = stmt.Step(sql);
         if(rc == SQLITE_DONE)
             return true;
         else
         {
             std::string msg = "Unable to remove database entry: ";
-            MIOPEN_LOG_E(msg + SQLErrorMessage());
+            MIOPEN_LOG_E(msg + sql.ErrorMessage());
             return false;
         }
     }
@@ -586,12 +495,12 @@ class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
             std::string clause;
             std::vector<std::string> vals;
             std::tie(clause, vals) = problem_config.InsertQuery();
-            auto stmt = PrepareAndBind(clause, vals);
-            auto rc   = SQLRety([&]() { return sqlite3_step(stmt.get()); });
+            auto stmt = SQLite::Statement{sql, clause, vals};
+            auto rc   = stmt.Step(sql);
             if(rc != SQLITE_DONE)
                 MIOPEN_THROW(miopenStatusInternalError,
-                             "Failed to insert config: " + SQLErrorMessage());
-            auto cnt = sqlite3_changes(ptrDb.get());
+                             "Failed to insert config: " + sql.ErrorMessage());
+            auto cnt = sql.Changes();
             MIOPEN_LOG_I2(cnt << " rows updated");
         }
 
@@ -615,12 +524,12 @@ class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
             vals.push_back(params.str());
             vals.push_back(arch);
             vals.push_back(std::to_string(num_cu));
-            auto stmt = PrepareAndBind(query, vals);
-            auto rc   = SQLRety([&]() { return sqlite3_step(stmt.get()); });
+            auto stmt = SQLite::Statement{sql, query, vals};
+            auto rc   = stmt.Step(sql);
             if(rc != SQLITE_DONE)
             {
                 MIOPEN_LOG_E("Failed to insert performance record in the database: " +
-                             SQLErrorMessage());
+                             sql.ErrorMessage());
                 return boost::none;
             }
         }
@@ -655,11 +564,11 @@ class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
             "SELECT id FROM config WHERE ( "
             + clause + " ))";
         // clang-format on
-        auto stmt = PrepareAndBind(query, values);
-        auto rc   = SQLRety([&]() { return sqlite3_step(stmt.get()); });
+        auto stmt = SQLite::Statement{sql, query, values};
+        auto rc   = stmt.Step(sql);
         if(rc != SQLITE_DONE)
         {
-            MIOPEN_LOG_E("Unable to Clear databaes entry: " + SQLErrorMessage());
+            MIOPEN_LOG_E("Unable to Clear databaes entry: " + sql.ErrorMessage());
             return false;
         }
         else
diff --git a/src/kern_db.cpp b/src/kern_db.cpp
index 07faf5998a..b991e59574 100644
--- a/src/kern_db.cpp
+++ b/src/kern_db.cpp
@@ -44,32 +44,29 @@ KernDb::KernDb(const std::string& filename_,
       compress_fn(_compress_fn),
       decompress_fn(_decompress_fn)
 {
+    if(dbInvalid)
+    {
+        if(filename.empty())
+            MIOPEN_LOG_I("database not present");
+        else
+            MIOPEN_LOG_I(filename + " database invalid");
+        return;
+    }
     if(!is_system)
     {
         const auto lock = exclusive_lock(lock_file, GetLockTimeout());
         MIOPEN_VALIDATE_LOCK(lock);
         const std::string create_table = KernelConfig::CreateQuery();
-        if(!SQLExec(create_table))
-            MIOPEN_THROW(miopenStatusInternalError);
+        sql.Exec(create_table);
         MIOPEN_LOG_I2("Database created successfully");
     }
-    if(!dbInvalid)
+    if(!CheckTableColumns(KernelConfig::table_name(), KernelConfig::FieldNames()))
     {
-        if(!CheckTableColumns(KernelConfig::table_name(), KernelConfig::FieldNames()))
-        {
-            std::ostringstream ss;
-            ss << "Invalid fields in table: " << KernelConfig::table_name()
-               << " disabling access to " << filename;
-            MIOPEN_LOG_W(ss.str());
-            dbInvalid = true;
-        }
-    }
-    else
-    {
-        if(filename.empty())
-            MIOPEN_LOG_I("database not present");
-        else
-            MIOPEN_LOG_I(filename + " database invalid");
+        std::ostringstream ss;
+        ss << "Invalid fields in table: " << KernelConfig::table_name() << " disabling access to "
+           << filename;
+        MIOPEN_LOG_W(ss.str());
+        dbInvalid = true;
     }
 }
 
diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_gnchw_gkcyx_gnkhw.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_gnchw_gkcyx_gnkhw.hpp
index ff5f5e0704..614c129b35 100644
--- a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_gnchw_gkcyx_gnkhw.hpp
+++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_gnchw_gkcyx_gnkhw.hpp
@@ -155,11 +155,11 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_xdlops_fp16_bfp16_gnchw_
             GridwiseBatchedGemmTransposedANormalBNormalCXdlopsFp16Bfp16_v1<
                 GridSize,
                 BlockSize,
-                Float,    // Input data type = half (fp16) or ushort (bfp16)
+                Float,    // Input data type = fp16 (fp16) or ushort (bfp16)
                 AccFloat, // Acc data type = float
-                AccFloat, // Output data type = float  (not half/ushort as this kernel uses atomic
+                AccFloat, // Output data type = float  (not fp16/ushort as this kernel uses atomic
                           // add.
-                          // No ISA for half/ushort atomic add)
+                          // No ISA for fp16/ushort atomic add)
                 decltype(wei_gemmg_gemmk_gemmm_gemmkpack_global_desc),
                 decltype(out_gemmg_gemmk_gemmn_gemmkpack_global_desc),
                 decltype(in_gemmg_gemmm_gemmn_global_desc),
diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_nchw_kcyx_nkhw.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_nchw_kcyx_nkhw.hpp
index 5f2e1422df..5a59bda3a2 100644
--- a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_nchw_kcyx_nkhw.hpp
+++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_nchw_kcyx_nkhw.hpp
@@ -142,10 +142,10 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_xdlops_f16_bfp16_nchw_kc
         constexpr auto gridwise_gemm = GridwiseGemmTransposedANormalBNormalCXdlopsFp16Bfp16_v1<
             GridSize,
             BlockSize,
-            Float,    // Input data type = half (fp16) or ushort (bfp16)
+            Float,    // Input data type = fp16 (fp16) or ushort (bfp16)
             AccFloat, // Acc data type = float
-            AccFloat, // Output data type = float  (not half/ushort as this kernel uses atomic add.
-                      // No ISA for half/ushort atomic add)
+            AccFloat, // Output data type = float  (not fp16/ushort as this kernel uses atomic add.
+                      // No ISA for fp16/ushort atomic add)
             decltype(wei_gemmk_gemmm_gemmkpack_global_desc),
             decltype(out_gemmk_gemmn_gemmkpack_global_desc),
             decltype(in_gemmm_gemmn_global_desc),
diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp
index 2e78877f88..680fd1a7bf 100644
--- a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -405,12 +405,12 @@ struct GridwiseConvolutionImplicitGemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_b
                 blockwise_wei_copy.RunLoadThreadBuffer(p_wei_block_on_global, p_wei_thread_buffer);
 
                 // LDS double buffer: GEMM on current data
-                // Vectorize the pointer to match with how half/bfloat16 datatypes are
-                // processed in gemm operation. Half type packs 4 half values while
+                // Vectorize the pointer to match with how fp16/bfloat16 datatypes are
+                // processed in gemm operation. fp16 type packs 4 fp16 values while
                 // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B
                 // 2D indexes are computed with a single value in mind (e.g. float),
-                // to retain the same 2D indexes for half/bfloat16, we recast datatype
-                // from a single half to 4 packed half/2 packed bfloat16 respectively.
+                // to retain the same 2D indexes for fp16/bfloat16, we recast datatype
+                // from a single fp16 to 4 packed fp16/2 packed bfloat16 respectively.
                 const typename vector_type<Float, EPACK>::MemoryType* p_a_block_vec =
                     reinterpret_cast<const typename vector_type<Float, EPACK>::MemoryType*>(
                         p_wei_block_double);
diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp
index 7c85e48ed7..d76c866d0e 100644
--- a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -425,11 +425,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double
                 blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer);
 
                 // LDS double buffer: GEMM on current data
-                // Vectorize the pointer to match with how half/bfloat16 datatypes are
-                // processed in gemm operation. Half type packs 4 half values while
+                // Vectorize the pointer to match with how fp16/bfloat16 datatypes are
+                // processed in gemm operation. fp16 type packs 4 fp16 values while
                 // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B
                 // 2D indexes are computed with vectorized data in mind (e.g. float,half4, short2),
-                // we recast datatype from a single half/bfloat16 to 4 packed half/2 packed bfloat16
+                // we recast datatype from a single fp16/bfloat16 to 4 packed fp16/2 packed bfloat16
                 // respectively.
                 const typename vector_type<Float, EPack>::MemoryType* p_a_block_vec =
                     reinterpret_cast<const typename vector_type<Float, EPack>::MemoryType*>(
@@ -465,11 +465,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double
                 blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer);
 
                 // LDS double buffer: GEMM on 2nd-last data
-                // Vectorize the pointer to match with how half/bfloat16 datatypes are
-                // processed in gemm operation. Half type packs 4 half values while
+                // Vectorize the pointer to match with how fp16/bfloat16 datatypes are
+                // processed in gemm operation. fp16 type packs 4 fp16 values while
                 // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B
                 // 2D indexes are computed with vectorized data in mind (e.g. float,half4, short2),
-                // we recast datatype from a single half/bfloat16 to 4 packed half/2 packed bfloat16
+                // we recast datatype from a single fp16/bfloat16 to 4 packed fp16/2 packed bfloat16
                 // respectively.
                 const typename vector_type<Float, EPack>::MemoryType* p_a_block_vec =
                     reinterpret_cast<const typename vector_type<Float, EPack>::MemoryType*>(
diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp
index 792127a001..6380731116 100644
--- a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -340,12 +340,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds
                 blockwise_wei_copy.RunLoadThreadBuffer(p_wei_block_on_global, p_wei_thread_buffer);
 
                 // LDS double buffer: GEMM on current data
-                // Vectorize the pointer to match with how half/bfloat16 datatypes are
-                // processed in gemm operation. Half type packs 4 half values while
+                // Vectorize the pointer to match with how fp16/bfloat16 datatypes are
+                // processed in gemm operation. fp16 type packs 4 fp16 values while
                 // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B
                 // 2D indexes are computed with a single value in mind (e.g. float),
-                // to retain the same 2D indexes for half/bfloat16, we recast datatype
-                // from a single half to 4 packed half/2 packed bfloat16 respectively.
+                // to retain the same 2D indexes for fp16/bfloat16, we recast datatype
+                // from a single fp16 to 4 packed fp16/2 packed bfloat16 respectively.
                 const typename vector_type<Float, EPack>::MemoryType* p_a_block_vec =
                     reinterpret_cast<const typename vector_type<Float, EPack>::MemoryType*>(
                         p_wei_block_double);
diff --git a/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_fp16_bfp16.hpp b/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_fp16_bfp16.hpp
index ffa9745ed3..8f1e84ea84 100644
--- a/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_fp16_bfp16.hpp
+++ b/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_fp16_bfp16.hpp
@@ -269,11 +269,11 @@ struct GridwiseGemmTransposedANormalBNormalCFp16Bfp16_v1
                 b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);
 
                 // LDS double buffer: GEMM on current data
-                // Vectorize the pointer to match with how half/bfloat16 datatypes are
-                // processed in gemm operation. Half type packs 4 half values while
+                // Vectorize the pointer to match with how fp16/bfloat16 datatypes are
+                // processed in gemm operation. fp16 type packs 4 fp16 values while
                 // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B
                 // 2D indexes are computed with vectorized value in mind (e.g. float, half2, half4),
-                // we recast datatype from a single half to 4 packed half/2 packed bfloat16
+                // we recast datatype from a single fp16 to 4 packed fp16/2 packed bfloat16
                 // respectively.
                 const typename vector_type<ABFloat, KPACK>::MemoryType* p_a_block_vec =
                     reinterpret_cast<const typename vector_type<ABFloat, KPACK>::MemoryType*>(
diff --git a/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp b/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
index 19d010f9b7..e1d989302e 100644
--- a/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
+++ b/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
@@ -252,11 +252,11 @@ struct GridwiseGemmTransposedANormalBNormalCXdlopsFp16Bfp16_v1
                 b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);
 
                 // LDS double buffer: GEMM on current data
-                // Vectorize the pointer to match with how half/bfloat16 datatypes are
-                // processed in gemm operation. Half type packs 4 half values while
+                // Vectorize the pointer to match with how fp16/bfloat16 datatypes are
+                // processed in gemm operation. fp16 type packs 4 fp16 values while
                 // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B
                 // 2D indexes are computed with vectorized value in mind (e.g. float, half2, half4),
-                // we recast datatype from a single half to 4 packed half/2 packed bfloat16
+                // we recast datatype from a single fp16 to 4 packed fp16/2 packed bfloat16
                 // respectively.
                 const typename vector_type<ABFloat, KPACK>::MemoryType* p_a_block_vec =
                     reinterpret_cast<const typename vector_type<ABFloat, KPACK>::MemoryType*>(
@@ -601,11 +601,11 @@ struct GridwiseBatchedGemmTransposedANormalBNormalCXdlopsFp16Bfp16_v1
                 b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);
 
                 // LDS double buffer: GEMM on current data
-                // Vectorize the pointer to match with how half/bfloat16 datatypes are
-                // processed in gemm operation. Half type packs 4 half values while
+                // Vectorize the pointer to match with how fp16/bfloat16 datatypes are
+                // processed in gemm operation. fp16 type packs 4 fp16 values while
                 // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B
                 // 2D indexes are computed with vectorized value in mind (e.g. float, half2, half4),
-                // we recast datatype from a single half to 4 packed half/2 packed bfloat16
+                // we recast datatype from a single fp16 to 4 packed fp16/2 packed bfloat16
                 // respectively.
                 const typename vector_type<ABFloat, KPACK>::MemoryType* p_a_block_vec =
                     reinterpret_cast<const typename vector_type<ABFloat, KPACK>::MemoryType*>(
diff --git a/src/kernels/composable_kernel/include/tensor_operation/xdlops_gemm.hpp b/src/kernels/composable_kernel/include/tensor_operation/xdlops_gemm.hpp
index 99998617df..3be177cbcd 100644
--- a/src/kernels/composable_kernel/include/tensor_operation/xdlops_gemm.hpp
+++ b/src/kernels/composable_kernel/include/tensor_operation/xdlops_gemm.hpp
@@ -5,6 +5,8 @@
 #include "ConstantMatrixDescriptor.hpp"
 #include "math.hpp"
 
+#define WORKAROUND_SWDEV_229564 1
+
 namespace ck {
 
 enum struct mfma_instr
@@ -205,7 +207,7 @@ struct mfma_info<mfma_instr::mfma_f32_32x32x4f16>
 
     template <index_t MPerWave, index_t NPerWave>
     __device__ void
-    run(Number<MPerWave>, Number<NPerWave>, const half* a, const half* b, float* reg_c) const
+    run(Number<MPerWave>, Number<NPerWave>, const half_t* a, const half_t* b, float* reg_c) const
     {
         static_assert((MPerWave == 64 && NPerWave == 64) || (MPerWave == 32 && NPerWave == 64) ||
                           (MPerWave == 64 && NPerWave == 32),
@@ -237,7 +239,7 @@ struct mfma_info<mfma_instr::mfma_f32_32x32x8f16>
 
     template <index_t MPerWave, index_t NPerWave>
     __device__ void
-    run(Number<MPerWave>, Number<NPerWave>, const half* a, const half* b, float* reg_c) const
+    run(Number<MPerWave>, Number<NPerWave>, const half_t* a, const half_t* b, float* reg_c) const
     {
         static_assert((MPerWave == 32 && NPerWave == 32), "unsupported xdlops gemm");
 
@@ -267,7 +269,7 @@ struct mfma_info<mfma_instr::mfma_f32_16x16x16f16>
 
     template <index_t MPerWave, index_t NPerWave>
     __device__ void
-    run(Number<MPerWave>, Number<NPerWave>, const half* a, const half* b, float* reg_c) const
+    run(Number<MPerWave>, Number<NPerWave>, const half_t* a, const half_t* b, float* reg_c) const
     {
         static_assert((MPerWave == 16 && NPerWave == 16), "unsupported xdlops gemm");
 
@@ -297,7 +299,7 @@ struct mfma_info<mfma_instr::mfma_f32_16x16x4f16>
 
     template <index_t MPerWave, index_t NPerWave>
     __device__ void
-    run(Number<MPerWave>, Number<NPerWave>, const half* a, const half* b, float* reg_c) const
+    run(Number<MPerWave>, Number<NPerWave>, const half_t* a, const half_t* b, float* reg_c) const
     {
         static_assert((MPerWave == 16 && NPerWave == 64) || (MPerWave == 64 && NPerWave == 16),
                       "unsupported xdlops gemm");
@@ -328,7 +330,7 @@ struct mfma_info<mfma_instr::mfma_f32_4x4x4f16>
 
     template <index_t MPerWave, index_t NPerWave>
     __device__ void
-    run(Number<MPerWave>, Number<NPerWave>, const half* a, const half* b, float* reg_c) const
+    run(Number<MPerWave>, Number<NPerWave>, const half_t* a, const half_t* b, float* reg_c) const
     {
         static_assert((MPerWave == 4 || MPerWave == 8) && NPerWave == 64,
                       "unsupported xdlops gemm");
@@ -616,55 +618,55 @@ struct XdlopsGemm_t
     }
 
     template <>
-    __device__ static constexpr auto GetMFMAInfo<half, 64, 64>()
+    __device__ static constexpr auto GetMFMAInfo<half_t, 64, 64>()
     {
         return mfma_info<mfma_instr::mfma_f32_32x32x4f16>{};
     }
 
     template <>
-    __device__ static constexpr auto GetMFMAInfo<half, 64, 32>()
+    __device__ static constexpr auto GetMFMAInfo<half_t, 64, 32>()
     {
         return mfma_info<mfma_instr::mfma_f32_32x32x4f16>{};
     }
 
     template <>
-    __device__ static constexpr auto GetMFMAInfo<half, 32, 64>()
+    __device__ static constexpr auto GetMFMAInfo<half_t, 32, 64>()
     {
         return mfma_info<mfma_instr::mfma_f32_32x32x4f16>{};
     }
 
     template <>
-    __device__ static constexpr auto GetMFMAInfo<half, 32, 32>()
+    __device__ static constexpr auto GetMFMAInfo<half_t, 32, 32>()
     {
         return mfma_info<mfma_instr::mfma_f32_32x32x8f16>{};
     }
 
     template <>
-    __device__ static constexpr auto GetMFMAInfo<half, 16, 16>()
+    __device__ static constexpr auto GetMFMAInfo<half_t, 16, 16>()
     {
         return mfma_info<mfma_instr::mfma_f32_16x16x16f16>{};
     }
 
     template <>
-    __device__ static constexpr auto GetMFMAInfo<half, 16, 64>()
+    __device__ static constexpr auto GetMFMAInfo<half_t, 16, 64>()
     {
         return mfma_info<mfma_instr::mfma_f32_16x16x4f16>{};
     }
 
     template <>
-    __device__ static constexpr auto GetMFMAInfo<half, 64, 16>()
+    __device__ static constexpr auto GetMFMAInfo<half_t, 64, 16>()
     {
         return mfma_info<mfma_instr::mfma_f32_16x16x4f16>{};
     }
 
     template <>
-    __device__ static constexpr auto GetMFMAInfo<half, 4, 64>()
+    __device__ static constexpr auto GetMFMAInfo<half_t, 4, 64>()
     {
         return mfma_info<mfma_instr::mfma_f32_4x4x4f16>{};
     }
 
     template <>
-    __device__ static constexpr auto GetMFMAInfo<half, 8, 64>()
+    __device__ static constexpr auto GetMFMAInfo<half_t, 8, 64>()
     {
         return mfma_info<mfma_instr::mfma_f32_4x4x4f16>{};
     }
@@ -851,6 +853,9 @@ struct XdlopsGemm_t
             auto pa = reinterpret_cast<const data_type*>(&a);
             auto pb = reinterpret_cast<const data_type*>(&b);
 
+#if WORKAROUND_SWDEV_229564
+#pragma unroll
+#endif
             for(index_t k = 0; k < K; ++k)
             {
                 constexpr index_t nxdlops = sizeof(FloatA) / (mfma_type.k * sizeof(data_type));
@@ -883,6 +888,9 @@ struct XdlopsGemm_t
             constexpr index_t nxdlops =
                 (sizeof(FloatA) * mfma_type.num_input_blks) / (mfma_type.k * sizeof(data_type));
 
+#if WORKAROUND_SWDEV_229564
+#pragma unroll
+#endif
             for(index_t k = 0; k < K; k += mfma_type.num_input_blks)
             {
                 for(index_t i = 0; i < nxdlops; ++i, pa += mfma_type.k, pb += mfma_type.k)
diff --git a/src/kernels/composable_kernel/include/utility/float_type.hpp b/src/kernels/composable_kernel/include/utility/float_type.hpp
index 94edeebc3c..fc4094fe12 100644
--- a/src/kernels/composable_kernel/include/utility/float_type.hpp
+++ b/src/kernels/composable_kernel/include/utility/float_type.hpp
@@ -11,6 +11,7 @@ typedef float float16_t __attribute__((ext_vector_type(16)));
 typedef float float32_t __attribute__((ext_vector_type(32)));
 
 // float16
+typedef _Float16 half_t;
 typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
 typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
 typedef _Float16 half8_t __attribute__((ext_vector_type(8)));
@@ -85,37 +86,37 @@ struct vector_type<float, 4>
 };
 
 template <>
-struct vector_type<half, 1>
+struct vector_type<half_t, 1>
 {
-    using MemoryType = half;
+    using MemoryType = half_t;
 
     template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
+    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
     {
         static_assert(I < 1, "wrong");
-        *(reinterpret_cast<half*>(&v) + I) = s;
+        *(reinterpret_cast<half_t*>(&v) + I) = s;
     }
 };
 
 template <>
-struct vector_type<half, 2>
+struct vector_type<half_t, 2>
 {
     using MemoryType = half2_t;
 
     union DataType
     {
         MemoryType vector;
-        half scalar[2];
+        half_t scalar[2];
     };
 
     template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
+    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
     {
         static_assert(I < 2, "wrong");
-        *(reinterpret_cast<half*>(&v) + I) = s;
+        *(reinterpret_cast<half_t*>(&v) + I) = s;
     }
 
-    __host__ __device__ static MemoryType Pack(half s0, half s1)
+    __host__ __device__ static MemoryType Pack(half_t s0, half_t s1)
     {
         DataType data;
         data.scalar[0] = s0;
@@ -125,24 +126,24 @@ struct vector_type<half, 2>
 };
 
 template <>
-struct vector_type<half, 4>
+struct vector_type<half_t, 4>
 {
     using MemoryType = half4_t;
 
     union DataType
     {
         MemoryType vector;
-        half scalar[4];
+        half_t scalar[4];
     };
 
     template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
+    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
     {
         static_assert(I < 4, "wrong");
-        *(reinterpret_cast<half*>(&v) + I) = s;
+        *(reinterpret_cast<half_t*>(&v) + I) = s;
     }
 
-    __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3)
+    __host__ __device__ static MemoryType Pack(half_t s0, half_t s1, half_t s2, half_t s3)
     {
         DataType data;
         data.scalar[0] = s0;
@@ -154,21 +155,21 @@ struct vector_type<half, 4>
 };
 
 template <>
-struct vector_type<half, 8>
+struct vector_type<half_t, 8>
 {
     using MemoryType = half8_t;
 
     union DataType
     {
         MemoryType vector;
-        half scalar[8];
+        half_t scalar[8];
     };
 
     template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
+    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
     {
         static_assert(I < 8, "wrong");
-        *(reinterpret_cast<half*>(&v) + I) = s;
+        *(reinterpret_cast<half_t*>(&v) + I) = s;
     }
 };
 
@@ -294,8 +295,8 @@ struct inner_product_with_conversion
 
     __device__ T operator()(half2_t a, half2_t b) const
     {
-        const half* p_a_half = reinterpret_cast<const half*>(&a);
-        const half* p_b_half = reinterpret_cast<const half*>(&b);
+        const half_t* p_a_half = reinterpret_cast<const half_t*>(&a);
+        const half_t* p_b_half = reinterpret_cast<const half_t*>(&b);
 
         T acc = 0;
         for(index_t v = 0; v < 2; ++v)
@@ -308,8 +309,8 @@ struct inner_product_with_conversion
 
     __device__ T operator()(half4_t a, half4_t b) const
     {
-        const half* p_a_half = reinterpret_cast<const half*>(&a);
-        const half* p_b_half = reinterpret_cast<const half*>(&b);
+        const half_t* p_a_half = reinterpret_cast<const half_t*>(&a);
+        const half_t* p_b_half = reinterpret_cast<const half_t*>(&b);
 
         T acc = 0;
         for(index_t v = 0; v < 4; ++v)
diff --git a/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp
index 467f14baac..d2f2604fa8 100644
--- a/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp
+++ b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp
@@ -230,15 +230,15 @@ extern "C" __global__
     gridwise_conv.Run(p_in_global, p_wei_global, p_out_global);
 #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION == 2
     // Backward weight in fp16/bfp16 uses atomic add to do reduction along K dimension
-    // It requires output blob to be of float as no atomic add exists for half/ushort
+    // It requires output blob to be of float as no atomic add exists for fp16/ushort
     constexpr auto gridwise_conv =
         GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_wrw_gnchw_gkcyx_gnkhw_lds_double_buffer<
             GridSize,
             BlockSize,
-            FLOAT,       // Input data type = half (fp16) or ushort (bfp16)
+            FLOAT,       // Input data type = fp16 (fp16) or ushort (bfp16)
             FLOAT_ACCUM, // Acc data type = float (see float_types.h)
-            float, // Output data type = float  (not half/ushort) as no atomic add ISA exists for
-                   // half/ushort.
+            float, // Output data type = float  (not fp16/ushort) as no atomic add ISA exists for
+                   // fp16/ushort.
             decltype(in_gnchw_desc),
             decltype(wei_gkcyx_desc),
             decltype(out_gnkhw_desc),
@@ -270,7 +270,7 @@ extern "C" __global__
             GemmBBlockCopySrcDataPerRead_GemmN,
             GemmBBlockCopyDstDataPerWrite_GemmKPACK,
             dir>{};
-    // Output blob is cast to float as no atomic add exists for half/ushort
+    // Output blob is cast to float as no atomic add exists for fp16/ushort
     gridwise_conv.Run(p_in_global, p_wei_global, reinterpret_cast<FLOAT_ACCUM*>(p_out_global));
 #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION != 2
     // Forward data doesn't use any atomic add so output blob remains of the same type
@@ -279,9 +279,9 @@ extern "C" __global__
         GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_fwd_gnchw_gkcyx_gnkhw_lds_double_buffer<
             GridSize,
             BlockSize,
-            FLOAT,       // Input data type = half (fp16) or ushort (bfp16)
+            FLOAT,       // Input data type = fp16 (fp16) or ushort (bfp16)
             FLOAT_ACCUM, // Acc data type = float (see float_types.h)
-            FLOAT,       // Output data type = half (fp16) or ushort (bfp16)
+            FLOAT,       // Output data type = fp16 (fp16) or ushort (bfp16)
             decltype(in_gnchw_desc),
             decltype(wei_gkcyx_desc),
             decltype(out_gnkhw_desc),
diff --git a/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp
index 194295647c..6c7ab7b233 100644
--- a/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp
+++ b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp
@@ -229,15 +229,15 @@ extern "C" __global__
 #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION == 2
 
     // Backward weight in fp16/bfp16 uses atomic add to do reduction along K dimension
-    // It requires output blob to be of float as no atomic add exists for half/ushort
+    // It requires output blob to be of float as no atomic add exists for fp16/ushort
     constexpr auto gridwise_conv =
         GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_wrw_nchw_kcyx_nkhw_lds_double_buffer<
             GridSize,
             BlockSize,
-            FLOAT,       // Input data type = half (fp16) or ushort (bfp16)
+            FLOAT,       // Input data type = fp16 (fp16) or ushort (bfp16)
             FLOAT_ACCUM, // Acc data type = float (see float_types.h)
-            float, // Output data type = float  (not half/ushort) as no atomic add ISA exists for
-                   // half/ushort.
+            float, // Output data type = float  (not fp16/ushort) as no atomic add ISA exists for
+                   // fp16/ushort.
             decltype(in_nchw_desc),
             decltype(wei_kcyx_desc),
             decltype(out_nkhw_desc),
@@ -270,7 +270,7 @@ extern "C" __global__
             GemmBBlockCopyDstDataPerWrite_GemmKPACK,
             dir>{};
 
-    // Output blob is cast to float as no atomic add exists for half/ushort
+    // Output blob is cast to float as no atomic add exists for fp16/ushort
     gridwise_conv.Run(p_in_global, p_wei_global, reinterpret_cast<FLOAT_ACCUM*>(p_out_global));
 #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION != 2
     // Forward data doesn't use any atomic add so output blob remains of the same type
@@ -287,9 +287,9 @@ extern "C" __global__
         GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_fwd_nchw_kcyx_nkhw_lds_double_buffer<
             GridSize,
             BlockSize,
-            FLOAT,       // Input data type = half (fp16) or ushort (bfp16)
+            FLOAT,       // Input data type = fp16 (fp16) or ushort (bfp16)
             FLOAT_ACCUM, // Acc data type = float (see float_types.h)
-            FLOAT,       // Input data type = half (fp16) or ushort (bfp16)
+            FLOAT,       // Input data type = fp16 (fp16) or ushort (bfp16)
             decltype(in_nchw_desc),
             decltype(wei_kcyx_desc),
             decltype(out_nkhw_desc),
diff --git a/src/kernels/float_types.h b/src/kernels/float_types.h
index 355a9409ba..1074e7ce2b 100644
--- a/src/kernels/float_types.h
+++ b/src/kernels/float_types.h
@@ -36,7 +36,7 @@
 
 #if MIOPEN_USE_FP16 == 1
 #ifdef __HIP_PLATFORM_HCC__
-#define FLOAT half
+#define FLOAT _Float16
 #define FLOAT_ACCUM float
 #else
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
diff --git a/src/ocl/gcn_asm_utils.cpp b/src/ocl/gcn_asm_utils.cpp
index d1b2b65df1..c1456f71e6 100644
--- a/src/ocl/gcn_asm_utils.cpp
+++ b/src/ocl/gcn_asm_utils.cpp
@@ -48,6 +48,12 @@
 #include <unistd.h>
 #endif // __linux__
 
+/// SWDEV-220166: hcc reports unknown target instead of amdgpu but reports "HCC" at least.
+#define WORKAROUND_SWDEV_220166 1
+/// SWDEV-233338: hip-clang reports unknown target instead of amdgpu.
+/// \todo Try to assemble AMD GCN source?
+#define WORKAROUND_SWDEV_233338 1
+
 MIOPEN_DECLARE_ENV_VAR(MIOPEN_EXPERIMENTAL_GCN_ASM_PATH)
 
 static const char option_no_co_v3[] = "-mno-code-object-v3";
@@ -81,10 +87,13 @@ bool ValidateGcnAssemblerImpl()
     const auto path = GetGcnAssemblerPath();
     if(path.empty())
     {
+        MIOPEN_LOG_NQE("Path to assembler is not provided. Expect performance degradation.");
         return false;
     }
     if(!std::ifstream(path).good())
     {
+        MIOPEN_LOG_NQE("Wrong path to assembler: '" << path
+                                                    << "'. Expect performance degradation.");
         return false;
     }
 
@@ -100,23 +109,28 @@ bool ValidateGcnAssemblerImpl()
     std::string clang_result_line;
     std::getline(clang_stdout, clang_result_line);
     MIOPEN_LOG_NQI2(clang_result_line);
+
+#if WORKAROUND_SWDEV_220166
     if(clang_result_line.find("HCC") != std::string::npos)
-        // Temporary fix for SWDEV-220166 which causes clang to report unknown
-        // architecture for AMD GCN
         return true;
-    else if(clang_result_line.find("clang") != std::string::npos)
+#endif
+
+    if(clang_result_line.find("clang") != std::string::npos)
     {
         while(!clang_stdout.eof())
         {
             std::getline(clang_stdout, clang_result_line);
             MIOPEN_LOG_NQI2(clang_result_line);
-            if(clang_result_line.find("Target: ") != std::string::npos)
-            {
-                return clang_result_line.find("amdgcn") != std::string::npos;
-            }
+            if(clang_result_line.find("Target: ") != std::string::npos &&
+               clang_result_line.find("amdgcn") != std::string::npos)
+                return true;
         }
+#if WORKAROUND_SWDEV_233338
+        return true;
+#endif
     }
 #endif // __linux__
+    MIOPEN_LOG_NQE("Specified assembler does not support AMDGPU. Expect performance degradation.");
     return false;
 }
 
diff --git a/src/ocl/utilocl.cpp b/src/ocl/utilocl.cpp
index 21a5ac7643..27d7a76c13 100644
--- a/src/ocl/utilocl.cpp
+++ b/src/ocl/utilocl.cpp
@@ -62,17 +62,17 @@ float Im2d2ColGPU(Handle& handle,
 
     // clang-format off
     std::string network_config =
-        "c" + std::to_string(c) + 
-        "i" + std::to_string(in_h) + 
-        "_" + std::to_string(in_w) + 
-        "w" + std::to_string(wei_h) + 
-        "_" + std::to_string(wei_w) + 
-        "p" + std::to_string(pad_h) + 
-        "_" + std::to_string(pad_w) + 
-        "s" + std::to_string(stride_h) + 
-        "_" + std::to_string(stride_w) + 
-        "d" + std::to_string(dilation_h) + 
-        "_" + std::to_string(dilation_w) + 
+        "c" + std::to_string(c) +
+        "i" + std::to_string(in_h) +
+        "_" + std::to_string(in_w) +
+        "w" + std::to_string(wei_h) +
+        "_" + std::to_string(wei_w) +
+        "p" + std::to_string(pad_h) +
+        "_" + std::to_string(pad_w) +
+        "s" + std::to_string(stride_h) +
+        "_" + std::to_string(stride_w) +
+        "d" + std::to_string(dilation_h) +
+        "_" + std::to_string(dilation_w) +
         "t" + std::to_string(type);
     // clang-format on
 
@@ -249,22 +249,22 @@ float Im3d2ColGPU(Handle& handle,
 
     // clang-format off
     std::string network_config =
-        "c" + std::to_string(im_c) + 
-        "i" + std::to_string(im_d) + 
-        "_" + std::to_string(im_h) + 
-        "_" + std::to_string(im_w) + 
-        "w" + std::to_string(wei_d) + 
-        "_" + std::to_string(wei_h) + 
-        "_" + std::to_string(wei_w) + 
-        "p" + std::to_string(pad_d) + 
-        "_" + std::to_string(pad_h) + 
-        "_" + std::to_string(pad_w) + 
-        "s" + std::to_string(stride_d) + 
+        "c" + std::to_string(im_c) +
+        "i" + std::to_string(im_d) +
+        "_" + std::to_string(im_h) +
+        "_" + std::to_string(im_w) +
+        "w" + std::to_string(wei_d) +
+        "_" + std::to_string(wei_h) +
+        "_" + std::to_string(wei_w) +
+        "p" + std::to_string(pad_d) +
+        "_" + std::to_string(pad_h) +
+        "_" + std::to_string(pad_w) +
+        "s" + std::to_string(stride_d) +
         "_" + std::to_string(stride_h) +
-        "_" + std::to_string(stride_w) + 
-        "d" + std::to_string(dilation_d) + 
-        "_" + std::to_string(dilation_h) + 
-        "_" + std::to_string(dilation_w) + 
+        "_" + std::to_string(stride_w) +
+        "d" + std::to_string(dilation_d) +
+        "_" + std::to_string(dilation_h) +
+        "_" + std::to_string(dilation_w) +
         "t" + std::to_string(type);
     // clang-format on
 
@@ -364,17 +364,17 @@ float Col2Im2dGPU(Handle& handle,
 
     // clang-format off
     std::string network_config =
-        "c" + std::to_string(in_c) + 
+        "c" + std::to_string(in_c) +
         "in_h" + std::to_string(in_h) +
-        "in_w" + std::to_string(in_w) + 
-        "y" + std::to_string(wei_h) + 
-        "x" + std::to_string(wei_w) + 
-        "p" + std::to_string(pad_h) + 
-        "q" + std::to_string(pad_w) + 
-        "u" + std::to_string(stride_h) + 
-        "v" + std::to_string(stride_w) + 
-        "l" + std::to_string(dilation_h) + 
-        "j" + std::to_string(dilation_w) + 
+        "in_w" + std::to_string(in_w) +
+        "y" + std::to_string(wei_h) +
+        "x" + std::to_string(wei_w) +
+        "p" + std::to_string(pad_h) +
+        "q" + std::to_string(pad_w) +
+        "u" + std::to_string(stride_h) +
+        "v" + std::to_string(stride_w) +
+        "l" + std::to_string(dilation_h) +
+        "j" + std::to_string(dilation_w) +
         "t" + std::to_string(type);
     // clang-format on
 
@@ -458,22 +458,22 @@ float Col2Im3dGPU(Handle& handle,
 
     // clang-format off
     std::string network_config =
-        "c" + std::to_string(in_c) + 
-        "i" + std::to_string(in_d) + 
-        "_" + std::to_string(in_h) + 
-        "_" + std::to_string(in_w) + 
-        "w" + std::to_string(wei_d) + 
-        "_" + std::to_string(wei_h) + 
-        "_" + std::to_string(wei_w) + 
-        "p" + std::to_string(pad_d) + 
-        "_" + std::to_string(pad_h) + 
-        "_" + std::to_string(pad_w) + 
-        "s" + std::to_string(stride_d) + 
+        "c" + std::to_string(in_c) +
+        "i" + std::to_string(in_d) +
+        "_" + std::to_string(in_h) +
+        "_" + std::to_string(in_w) +
+        "w" + std::to_string(wei_d) +
+        "_" + std::to_string(wei_h) +
+        "_" + std::to_string(wei_w) +
+        "p" + std::to_string(pad_d) +
+        "_" + std::to_string(pad_h) +
+        "_" + std::to_string(pad_w) +
+        "s" + std::to_string(stride_d) +
         "_" + std::to_string(stride_h) +
-        "_" + std::to_string(stride_w) + 
-        "d" + std::to_string(dilation_d) + 
-        "_" + std::to_string(dilation_h) + 
-        "_" + std::to_string(dilation_w) + 
+        "_" + std::to_string(stride_w) +
+        "d" + std::to_string(dilation_d) +
+        "_" + std::to_string(dilation_h) +
+        "_" + std::to_string(dilation_w) +
         "t" + std::to_string(type);
     // clang-format on
 
@@ -960,7 +960,7 @@ float transpose_NCHW2Vec(Handle& handle,
         lens.begin() + 2, lens.end(), std::size_t(1), std::multiplies<std::size_t>());
 
     // clang-format off
-    std::string network_config = 
+    std::string network_config =
         "n" + std::to_string(n) +
         "c" + std::to_string(c) +
         "hw" + std::to_string(hw) +
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp
index 6078870e4a..523cbf58f1 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp
@@ -23,12 +23,15 @@
  * SOFTWARE.
  *
  *******************************************************************************/
-#include <cstddef>
-#include "miopen/solver.hpp"
-#include "miopen/handle.hpp"
+#include <miopen/conv/invokers/impl_gemm.hpp>
+#include <miopen/solver.hpp>
+#include <miopen/handle.hpp>
 #include <miopen/generic_search.hpp>
+
 #include "implicitgemm_util.hpp"
 
+#include <cstddef>
+
 namespace miopen {
 namespace solver {
 
@@ -882,6 +885,7 @@ ConvSolution ConvHipImplicitGemmBwdDataV1R1::GetSolution(
             std::to_string(GemmBBlockCopyDstDataPerWrite_GemmKPACK);
     }
 
+    result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx);
     result.construction_params.push_back(construction_parameters);
     return result;
 }
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp
index 37ca8c685f..4f6e8dabb8 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp
@@ -23,12 +23,15 @@
  * SOFTWARE.
  *
  *******************************************************************************/
-#include <cstddef>
-#include "miopen/solver.hpp"
-#include "miopen/handle.hpp"
+#include <miopen/conv/invokers/impl_gemm.hpp>
+#include <miopen/solver.hpp>
+#include <miopen/handle.hpp>
 #include <miopen/generic_search.hpp>
+
 #include "implicitgemm_util.hpp"
 
+#include <cstddef>
+
 namespace miopen {
 namespace solver {
 
@@ -283,6 +286,7 @@ ConvSolution ConvHipImplicitGemmBwdDataV1R1Xdlops::GetSolution(
             std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_KPACK=") + std::to_string(GemmBBlockCopyDstDataPerWrite_GemmKPACK);
     }
 
+    result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx);
     result.construction_params.push_back(construction_parameters);
     return result;
 }
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp
index f6651060e9..48066d0b23 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp
@@ -23,13 +23,16 @@
  * SOFTWARE.
  *
  *******************************************************************************/
-#include <cstddef>
-#include <numeric>
-#include "miopen/solver.hpp"
-#include "miopen/handle.hpp"
+#include <miopen/conv/invokers/impl_gemm.hpp>
+#include <miopen/solver.hpp>
+#include <miopen/handle.hpp>
 #include <miopen/generic_search.hpp>
+
 #include "implicitgemm_util.hpp"
 
+#include <cstddef>
+#include <numeric>
+
 namespace miopen {
 namespace solver {
 
@@ -446,9 +449,9 @@ bool PerformanceImplicitGemmBwdDataV4R1::IsValidValue() const
 {
     // clang-format off
     return IsTwoPower<64, 256>(BlockSize) &&
-           IsTwoPower<32, 128>(GemmMPerBlock) && 
+           IsTwoPower<32, 128>(GemmMPerBlock) &&
            IsTwoPower<32, 128>(GemmNPerBlock) &&
-           IsTwoPower<4, 16>(GemmKPerBlock) && 
+           IsTwoPower<4, 16>(GemmKPerBlock) &&
            IsTwoPower<2, 4>(GemmMPerThread) &&
            IsTwoPower<2, 4>(GemmNPerThread);
     // clang-format on
@@ -956,6 +959,7 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1::GetSolution(
         }
     }
 
+    result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx);
     return result;
 }
 
diff --git a/src/solver/conv_hip_implicit_gemm_v4.cpp b/src/solver/conv_hip_implicit_gemm_v4.cpp
index 0b4b6388a6..d33aa7a343 100644
--- a/src/solver/conv_hip_implicit_gemm_v4.cpp
+++ b/src/solver/conv_hip_implicit_gemm_v4.cpp
@@ -24,13 +24,15 @@
  *
  *******************************************************************************/
 
-#include "miopen/solver.hpp"
-#include "miopen/handle.hpp"
+#include <miopen/solver.hpp>
+
+#include <miopen/conv/invokers/impl_gemm.hpp>
+#include <miopen/handle.hpp>
 #include <miopen/generic_search.hpp>
-#include "miopen/stringutils.hpp"
-#include "implicitgemm_util.hpp"
-#include "miopen/implicitgemm_params.hpp"
-#include "miopen/hip_build_utils.hpp"
+#include <miopen/stringutils.hpp>
+#include <miopen/implicitgemm_params.hpp>
+#include <miopen/hip_build_utils.hpp>
+
 #include "implicitgemm_util.hpp"
 
 #define WORKAROUND_ISSUE_2174_2222_2224_2243 1
@@ -357,6 +359,9 @@ static inline ConvSolution GetSolutionBase(const ConvolutionContext& ctx,
         ctx.general_compile_options;
     // clang-format on
 
+    if(ctx.direction.IsForward() || ctx.direction.IsBackwardData())
+        result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx);
+
     result.construction_params.push_back(construction_parameters);
     return result;
 }
diff --git a/src/solver/conv_hip_implicit_gemm_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_v4r1.cpp
index 166fb0e9b3..7aa377af0f 100644
--- a/src/solver/conv_hip_implicit_gemm_v4r1.cpp
+++ b/src/solver/conv_hip_implicit_gemm_v4r1.cpp
@@ -23,12 +23,17 @@
  * SOFTWARE.
  *
  *******************************************************************************/
-#include <cstddef>
-#include "miopen/solver.hpp"
-#include "miopen/handle.hpp"
+
+#include <miopen/solver.hpp>
+
+#include <miopen/conv/invokers/impl_gemm.hpp>
+#include <miopen/handle.hpp>
 #include <miopen/generic_search.hpp>
+
 #include "implicitgemm_util.hpp"
 
+#include <cstddef>
+
 namespace miopen {
 namespace solver {
 
@@ -293,7 +298,7 @@ ConvSolution ConvHipImplicitGemmV4R1Fwd::GetSolution(const ConvolutionContext& c
     const auto InBlockCopyDstDataPerWrite_EPack  = !ctx.IsFp32() ? GetEPackLength(ctx, false) : 1;
 
     // clang-format off
-    construction_parameters.comp_options = 
+    construction_parameters.comp_options =
         std::string(" -std=c++14 ") +
         std::string(" -DCK_PARAM_PROBLEM_N=") + std::to_string(n) +
         std::string(" -DCK_PARAM_PROBLEM_K=") + std::to_string(k) +
@@ -359,6 +364,7 @@ ConvSolution ConvHipImplicitGemmV4R1Fwd::GetSolution(const ConvolutionContext& c
             std::to_string(WeiBlockCopyDstDataPerWrite_EPack);
     }
 
+    result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx);
     result.construction_params.push_back(construction_parameters);
     return result;
 }
@@ -492,7 +498,7 @@ ConvSolution ConvHipImplicitGemmV4R1WrW::GetSolution(const ConvolutionContext& c
     InBlockCopySrcDataPerRead_B = ctx.kernel_stride_w > 1 ? 1 : InBlockCopySrcDataPerRead_B;
 
     // clang-format off
-    construction_parameters.comp_options = 
+    construction_parameters.comp_options =
         std::string(" -std=c++14 ") +
         std::string(" -DCK_PARAM_PROBLEM_N=") + std::to_string(n) +
         std::string(" -DCK_PARAM_PROBLEM_K=") + std::to_string(k) +
@@ -535,7 +541,7 @@ ConvSolution ConvHipImplicitGemmV4R1WrW::GetSolution(const ConvolutionContext& c
         std::string(" -DCK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E=") + std::to_string(config.WeiBlockCopyClusterLengths_E) +
         std::string(" -DCK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K=") + std::to_string(config.WeiBlockCopyClusterLengths_K) +
         std::string(" -DCK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E=") + std::to_string(WeiBlockCopySrcDataPerRead_E) +
-        std::string(" -DCK_PARAM_EPACK_LENGTH=") + std::to_string(GetEPackLength(ctx, false)) + 
+        std::string(" -DCK_PARAM_EPACK_LENGTH=") + std::to_string(GetEPackLength(ctx, false)) +
         std::string(" -DCK_THREADWISE_GEMM_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx)? '1' : '0') +
         std::string(" -DCK_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') +
         ctx.general_compile_options;
diff --git a/src/solver/conv_hip_implicit_gemm_v4r4.cpp b/src/solver/conv_hip_implicit_gemm_v4r4.cpp
index 9b0c4295e1..af7f0f8261 100644
--- a/src/solver/conv_hip_implicit_gemm_v4r4.cpp
+++ b/src/solver/conv_hip_implicit_gemm_v4r4.cpp
@@ -23,12 +23,15 @@
  * SOFTWARE.
  *
  *******************************************************************************/
-#include <cstddef>
-#include "miopen/solver.hpp"
-#include "miopen/handle.hpp"
+#include <miopen/conv/invokers/impl_gemm.hpp>
+#include <miopen/solver.hpp>
+#include <miopen/handle.hpp>
 #include <miopen/generic_search.hpp>
+
 #include "implicitgemm_util.hpp"
 
+#include <cstddef>
+
 namespace miopen {
 namespace solver {
 
@@ -769,6 +772,7 @@ ConvSolution ConvHipImplicitGemmV4R4Fwd::GetSolution(const ConvolutionContext& c
 
     // clang-format on
 
+    result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx);
     result.construction_params.push_back(construction_parameters);
     return result;
 }
diff --git a/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops.cpp
index 090419b0cf..e8418a60c8 100644
--- a/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops.cpp
@@ -24,12 +24,15 @@
  *
  *******************************************************************************/
 
-#include "miopen/solver.hpp"
-#include "miopen/handle.hpp"
+#include <miopen/solver.hpp>
+
+#include <miopen/conv/invokers/impl_gemm.hpp>
+#include <miopen/handle.hpp>
 #include <miopen/generic_search.hpp>
-#include "miopen/stringutils.hpp"
+#include <miopen/stringutils.hpp>
+#include <miopen/implicitgemm_params.hpp>
+
 #include "implicitgemm_util.hpp"
-#include "miopen/implicitgemm_params.hpp"
 
 namespace miopen {
 namespace solver {
@@ -320,6 +323,9 @@ static inline ConvSolution GetSolutionBase(const ConvolutionContext& ctx,
         }
     }
 
+    if(ctx.direction.IsForward() || ctx.direction.IsBackwardData())
+        result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx);
+
     result.construction_params.push_back(construction_parameters);
     return result;
 }
diff --git a/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp b/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp
index 93a761b2f7..d1d5d21e52 100644
--- a/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp
+++ b/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp
@@ -24,13 +24,15 @@
  *
  *******************************************************************************/
 
-#include "miopen/solver.hpp"
-#include "miopen/handle.hpp"
+#include <miopen/conv/invokers/impl_gemm.hpp>
+#include <miopen/env.hpp>
 #include <miopen/generic_search.hpp>
-#include "miopen/stringutils.hpp"
+#include <miopen/handle.hpp>
+#include <miopen/implicitgemm_params.hpp>
+#include <miopen/solver.hpp>
+#include <miopen/stringutils.hpp>
+
 #include "implicitgemm_util.hpp"
-#include "miopen/implicitgemm_params.hpp"
-#include <miopen/env.hpp>
 
 namespace miopen {
 namespace solver {
@@ -531,6 +533,7 @@ ConvSolution ConvHipImplicitGemmV4R4GenXdlopsFwdFp32::GetSolution(
         ctx.general_compile_options;
     // clang-format on
 
+    result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx);
     result.construction_params.push_back(construction_parameters);
     return result;
 }
diff --git a/src/solver/conv_hip_implicit_gemm_v4r4_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_v4r4_xdlops.cpp
index 20ee060316..a47d4bef06 100644
--- a/src/solver/conv_hip_implicit_gemm_v4r4_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_v4r4_xdlops.cpp
@@ -24,14 +24,17 @@
  *
  *******************************************************************************/
 
-#include "miopen/solver.hpp"
-#include "miopen/handle.hpp"
+#include <miopen/solver.hpp>
+
+#include <miopen/conv/invokers/impl_gemm.hpp>
+#include <miopen/handle.hpp>
 #include <miopen/generic_search.hpp>
-#include "miopen/stringutils.hpp"
-#include "implicitgemm_util.hpp"
-#include "miopen/implicitgemm_params.hpp"
+#include <miopen/stringutils.hpp>
+#include <miopen/implicitgemm_params.hpp>
 #include <miopen/env.hpp>
 
+#include "implicitgemm_util.hpp"
+
 namespace miopen {
 namespace solver {
 
@@ -174,7 +177,7 @@ static inline ConvSolution GetSolutionBase(const ConvolutionContext& ctx,
     }
 
     // clang-format off
-    construction_parameters.comp_options += 
+    construction_parameters.comp_options +=
         std::string(" -std=c++14 ") +
         std::string(" -DCK_PARAM_PROBLEM_DIRECTION=") + std::to_string(static_cast<int>(direction)) +
         std::string(" -DCK_PARAM_PROBLEM_N=") + std::to_string(ctx.batch_sz) +
@@ -195,17 +198,20 @@ static inline ConvSolution GetSolutionBase(const ConvolutionContext& ctx,
         std::string(" -DCK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B=") + std::to_string(config.InBlockCopyClusterLengths_B) +
         std::string(" -DCK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E=") + std::to_string(config.WeiBlockCopyClusterLengths_E) +
         std::string(" -DCK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K=") + std::to_string(config.WeiBlockCopyClusterLengths_K) +
-        std::string(" -DCK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B=") + std::to_string(InBlockCopyDataPerAccess_B) + 
-        std::string(" -DCK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E=") + std::to_string(WeiBlockCopySrcDataPerRead_E) + 
-        std::string(" -DCK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K=") + std::to_string(WeiBlockCopyDstDataPerWrite_K) + 
-        std::string(" -DCK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B=") + std::to_string(OutThreadCopyDataPerAccess_B) + 
-        std::string(" -DCK_PARAM_EPACK_LENGTH=") + std::to_string(GetEPackLength(ctx, true)) + 
+        std::string(" -DCK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B=") + std::to_string(InBlockCopyDataPerAccess_B) +
+        std::string(" -DCK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E=") + std::to_string(WeiBlockCopySrcDataPerRead_E) +
+        std::string(" -DCK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K=") + std::to_string(WeiBlockCopyDstDataPerWrite_K) +
+        std::string(" -DCK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B=") + std::to_string(OutThreadCopyDataPerAccess_B) +
+        std::string(" -DCK_PARAM_EPACK_LENGTH=") + std::to_string(GetEPackLength(ctx, true)) +
         std::string(" -DCK_USE_AMD_XDLOPS=") + (IsXdlopsSupport(ctx) ? '1' : '0') +
         std::string(" -DCK_USE_AMD_XDLOPS_INLINE_ASM=") + (miopen::IsEnabled(MIOPEN_DEBUG_IMPLICIT_GEMM_XDLOPS_INLINE_ASM{}) ? '1' : '0') +
         std::string(" -DCK_USE_AMD_XDLOPS_EMULATE=") + (miopen::IsEnabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_XDLOPS_EMULATE{}) ? '1' : '0') +
         ctx.general_compile_options;
     // clang-format on
 
+    if(ctx.direction.IsForward() || ctx.direction.IsBackwardData())
+        result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx);
+
     result.construction_params.push_back(construction_parameters);
     return result;
 }
diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp
index d71c485de7..a05e9b6396 100644
--- a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp
+++ b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp
@@ -472,6 +472,8 @@ bool ConvOclBwdWrW2<N_BATCH_LOOPS>::IsApplicableBase(const ConvolutionContext& p
         return false;
     if(!params.Is2d())
         return false;
+    if(params.IsAsymmetricPadH() || params.IsAsymmetricPadW())
+        return false;
     if(!(params.IsFp32() || params.IsFp16() || params.IsBfp16()))
         return false;
 
diff --git a/src/sqlite_db.cpp b/src/sqlite_db.cpp
index 5ad9110f2e..ed2b5c42ee 100644
--- a/src/sqlite_db.cpp
+++ b/src/sqlite_db.cpp
@@ -50,19 +50,228 @@
 
 namespace miopen {
 
+class SQLite::impl
+{
+    struct SQLiteCloser
+    {
+        void operator()(sqlite3* ptr)
+        {
+            std::string filename_(sqlite3_db_filename(ptr, "main"));
+            SQLite::Retry([&]() { return sqlite3_close(ptr); }, filename_);
+        }
+    };
+
+    public:
+    impl(const std::string& filename_, bool is_system)
+    {
+        sqlite3* ptr_tmp;
+        int rc = 0;
+        if(is_system)
+            rc = sqlite3_open_v2(filename_.c_str(), &ptr_tmp, SQLITE_OPEN_READONLY, nullptr);
+        else
+            rc = sqlite3_open_v2(
+                filename_.c_str(), &ptr_tmp, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, nullptr);
+        ptrDb   = sqlite3_ptr{ptr_tmp};
+        isValid = (rc == 0);
+    }
+
+    using sqlite3_ptr = std::unique_ptr<sqlite3, SQLiteCloser>;
+    sqlite3_ptr ptrDb = nullptr;
+    bool isValid;
+};
+
+static int find_callback(void* _res, int argc, char** argv, char** azColName)
+{
+    SQLite::result_type* res = static_cast<SQLite::result_type*>(_res);
+    std::unordered_map<std::string, std::string> record;
+    for(auto i               = 0; i < argc; i++)
+        record[azColName[i]] = (argv[i] != nullptr) ? argv[i] : "NULL";
+    if(res != nullptr)
+        res->push_back(record);
+    return 0;
+}
+
+SQLite::SQLite() : pImpl(nullptr) {}
+SQLite::~SQLite()                 = default;
+SQLite::SQLite(SQLite&&) noexcept = default;
+SQLite& SQLite::operator=(SQLite&&) noexcept = default;
+SQLite::result_type SQLite::Exec(const std::string& query) const
+{
+    SQLite::result_type res;
+    MIOPEN_LOG_T(std::this_thread::get_id() << ":" << query);
+    {
+        auto rc = Retry([&]() {
+            return sqlite3_exec(pImpl->ptrDb.get(),
+                                query.c_str(),
+                                find_callback,
+                                static_cast<void*>(&res),
+                                nullptr);
+        });
+        if(rc != SQLITE_OK)
+        {
+            MIOPEN_LOG_I2(query);
+            MIOPEN_THROW(miopenStatusInternalError, ErrorMessage());
+        }
+    }
+    return res;
+}
+
+int SQLite::Retry(std::function<int()> f, std::string filename)
+{
+    auto timeout_end = std::chrono::high_resolution_clock::now() +
+                       std::chrono::seconds(30); // TODO: make configurable
+    auto tries = 0;
+    while(true)
+    {
+        int rc = f();
+        if(rc == SQLITE_BUSY)
+        {
+            MIOPEN_LOG_I2("Database" + filename + "  busy, retrying ...");
+            ++tries;
+            if(tries > 50)
+                std::this_thread::sleep_for(std::chrono::microseconds(100));
+            else
+                std::this_thread::yield();
+        }
+        else
+            return rc;
+        if(std::chrono::high_resolution_clock::now() > timeout_end)
+            MIOPEN_THROW("Timeout while waiting for Database: " + filename);
+    }
+}
+
+int SQLite::Retry(std::function<int()> f) const
+{
+    std::string filename(sqlite3_db_filename(pImpl->ptrDb.get(), "main"));
+    return SQLite::Retry(f, filename);
+}
+
+int SQLite::Changes() const { return sqlite3_changes(pImpl->ptrDb.get()); }
+
+std::string SQLite::ErrorMessage() const
+{
+    std::string errMsg = "Internal error while accessing SQLite database: ";
+    return errMsg + sqlite3_errmsg(pImpl->ptrDb.get());
+}
+bool SQLite::Valid() const { return pImpl->isValid; }
+
+class SQLite::Statement::impl
+{
+    using sqlite3_stmt_ptr = MIOPEN_MANAGE_PTR(sqlite3_stmt*, sqlite3_finalize);
+    sqlite3_stmt_ptr Prepare(const SQLite& sql, const std::string& query)
+    {
+        sqlite3_stmt* ptr = nullptr;
+        MIOPEN_LOG_I2(query);
+        auto rc =
+            sqlite3_prepare_v2(sql.pImpl->ptrDb.get(), query.c_str(), query.size(), &ptr, nullptr);
+        if(rc != SQLITE_OK)
+        {
+            std::string err_msg = "SQLite prepare error: ";
+            MIOPEN_THROW(miopenStatusInternalError, err_msg + sql.ErrorMessage());
+        }
+        return sqlite3_stmt_ptr{ptr};
+    }
+
+    public:
+    impl(const SQLite& sql, const std::string& query) { ptrStmt = Prepare(sql, query); }
+    impl(const SQLite& sql, const std::string& query, const std::vector<std::string>& vals)
+    {
+        ptrStmt = Prepare(sql, query);
+        int cnt = 1;
+        for(auto& kinder : vals)
+        {
+            auto rc = sqlite3_bind_text(
+                ptrStmt.get(), cnt++, kinder.data(), kinder.size(), SQLITE_TRANSIENT); // NOLINT
+            if(rc != SQLITE_OK)
+                MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage());
+        }
+        MIOPEN_LOG_I2("[" << JoinStrings(vals, ",") << "]");
+    }
+
+    sqlite3_stmt_ptr ptrStmt = nullptr;
+};
+
+SQLite::SQLite(const std::string& filename_, bool is_system)
+    : pImpl{std::make_unique<impl>(filename_, is_system)}
+{
+}
+
+SQLite::Statement::Statement(const SQLite& sql, const std::string& query)
+    : pImpl{std::make_unique<impl>(sql, query)}
+{
+}
+SQLite::Statement::Statement(const SQLite& sql,
+                             const std::string& query,
+                             const std::vector<std::string>& vals)
+    : pImpl{std::make_unique<impl>(sql, query, vals)}
+{
+}
+SQLite::Statement::~Statement() = default;
+SQLite::Statement::Statement() : pImpl{nullptr} {}
+SQLite::Statement::Statement(Statement&&) noexcept = default;
+SQLite::Statement& SQLite::Statement::operator=(Statement&&) noexcept = default;
+int SQLite::Statement::Step(const SQLite& sql)
+{
+    return sql.Retry([&]() { return sqlite3_step(pImpl->ptrStmt.get()); });
+}
+std::string SQLite::Statement::ColumnText(int idx)
+{
+    size_t bytes = sqlite3_column_bytes(pImpl->ptrStmt.get(), idx);
+    return std::string{
+        reinterpret_cast<const char*>(sqlite3_column_text(pImpl->ptrStmt.get(), idx)), bytes};
+}
+
+std::string SQLite::Statement::ColumnBlob(int idx)
+{
+    auto ptr = sqlite3_column_blob(pImpl->ptrStmt.get(), idx);
+    auto sz  = sqlite3_column_bytes(pImpl->ptrStmt.get(), idx);
+    return std::string{reinterpret_cast<const char*>(ptr), static_cast<size_t>(sz)};
+}
+int64_t SQLite::Statement::ColumnInt64(int idx)
+{
+    return sqlite3_column_int64(pImpl->ptrStmt.get(), idx);
+}
+
+int SQLite::Statement::BindText(int idx, const std::string& txt)
+{
+    sqlite3_bind_text(
+        pImpl->ptrStmt.get(), idx, txt.data(), txt.size(), SQLITE_TRANSIENT); // NOLINT
+    return 0;
+}
+int SQLite::Statement::BindBlob(int idx, const std::string& blob)
+{
+    sqlite3_bind_blob(
+        pImpl->ptrStmt.get(), idx, blob.data(), blob.size(), SQLITE_TRANSIENT); // NOLINT
+    return 0;
+}
+
+int SQLite::Statement::BindInt64(int idx, const int64_t num)
+{
+    sqlite3_bind_int64(pImpl->ptrStmt.get(), idx, num);
+    return 0;
+}
+
 SQLitePerfDb::SQLitePerfDb(const std::string& filename_,
                            bool is_system,
                            const std::string& arch_,
                            const std::size_t num_cu_)
     : SQLiteBase(filename_, is_system, arch_, num_cu_)
 {
+    if(dbInvalid)
+    {
+        if(filename.empty())
+            MIOPEN_LOG_I("database not present");
+        else
+            MIOPEN_LOG_I(filename + " database invalid");
+        return;
+    }
     ProblemDescription prob_desc{conv::Direction::Forward};
     prob_desc.in_data_type      = miopenFloat;
     prob_desc.out_data_type     = miopenFloat;
     prob_desc.weights_data_type = miopenFloat;
     if(!is_system)
     {
-        SQLRes_t res;
+        SQLite::result_type res;
         const std::string create_config = prob_desc.CreateQuery();
         // clang-format off
         const std::string create_perfdb_sql =
@@ -89,14 +298,13 @@ SQLitePerfDb::SQLitePerfDb(const std::string& filename_,
                   "type = 'table' AND "
                   "(name = 'config' OR name = 'perf_db');";
             // clang-format on
-            SQLExec(check_tables, res);
+            res = sql.Exec(check_tables);
         }
         if(res.empty())
         {
             const auto lock = exclusive_lock(lock_file, GetLockTimeout());
             MIOPEN_VALIDATE_LOCK(lock);
-            if(!SQLExec(create_config + create_perfdb_sql))
-                MIOPEN_THROW(miopenStatusInternalError);
+            sql.Exec(create_config + create_perfdb_sql);
             MIOPEN_LOG_I2("Database created successfully");
         }
     }
@@ -117,7 +325,5 @@ SQLitePerfDb::SQLitePerfDb(const std::string& filename_,
             dbInvalid = true;
         }
     }
-    else
-        MIOPEN_LOG_I(filename + " database invalid");
 }
 } // namespace miopen
diff --git a/test/find_db.cpp b/test/find_db.cpp
index ef216b21e0..b40ccb776a 100644
--- a/test/find_db.cpp
+++ b/test/find_db.cpp
@@ -203,5 +203,6 @@ struct FindDbTest : test_driver
 int main(int argc, const char* argv[])
 {
     setenv("MIOPEN_LOG_LEVEL", "6", 1);
+    setenv("MIOPEN_COMPILE_PARALLEL_LEVEL", "1", 1);
     test_drive<miopen::FindDbTest>(argc, argv);
 }
diff --git a/test/sqlite_perfdb.cpp b/test/sqlite_perfdb.cpp
index 049629d716..7ce588c948 100644
--- a/test/sqlite_perfdb.cpp
+++ b/test/sqlite_perfdb.cpp
@@ -225,12 +225,16 @@ std::ostream& operator<<(std::ostream& s, const SolverData& td)
 class DbTest
 {
     public:
-    DbTest() : temp_file("miopen.tests.perfdb") {}
+    DbTest()
+        : temp_file("miopen.tests.perfdb"), db_inst{std::string(temp_file), false, "gfx906", 64}
+    {
+    }
 
     virtual ~DbTest() {}
 
     protected:
     TempFile temp_file;
+    SQLitePerfDb db_inst;
 
     static const std::array<std::pair<std::string, SolverData>, 2>& common_data()
     {
@@ -243,11 +247,10 @@ class DbTest
 
     void ClearDb(SQLitePerfDb& db) const
     {
-        auto res = db.SQLExec("delete from config; delete from perf_db;");
-        EXPECT(res);
+        db.sql.Exec("delete from config; delete from perf_db;");
     }
 
-    void ResetDb() const {}
+    void ResetDb() const { db_inst.sql.Exec("delete from config; delete from perf_db;"); }
 
     static const ProblemData& key()
     {
@@ -328,35 +331,25 @@ class SchemaTest : public DbTest
     public:
     void Run() const
     {
-        SQLitePerfDb db_inst(std::string(temp_file), false, "gfx906", 64);
-
         // check if the config and perf_db tables exist
-        SQLitePerfDb::SQLRes_t res;
-        if(db_inst.SQLExec(
-               // clang-format off
-               "SELECT name, sql "
-               "FROM sqlite_master "
-               "WHERE type='table' "
-                 "AND name = 'config';"
-               // clang-format on
-               ,
-               res))
-            EXPECT(res.size() == 1);
-        else
-            EXPECT(false);
-        if(db_inst.SQLExec(
-               // clang-format off
-               "SELECT name, sql "
-               "FROM sqlite_master "
-               "WHERE type='table' "
-                 "AND name = 'perf_db';"
-               // clang-format on
-               ,
-               res))
-
-            EXPECT(res.size() == 1);
-        else
-            EXPECT(false);
+        SQLite::result_type res = db_inst.sql.Exec(
+            // clang-format off
+                "SELECT name, sql "
+                "FROM sqlite_master "
+                "WHERE type='table' "
+                "AND name = 'config';"
+            // clang-format on
+            );
+        EXPECT(res.size() == 1);
+        res = db_inst.sql.Exec(
+            // clang-format off
+                "SELECT name, sql "
+                "FROM sqlite_master "
+                "WHERE type='table' "
+                "AND name = 'perf_db';"
+            // clang-format on
+            );
+        EXPECT(res.size() == 1);
         // TODO: check for indices
     }
 };
@@ -364,10 +357,9 @@ class SchemaTest : public DbTest
 class DbFindTest : public DbTest
 {
     public:
-    void Run() const
+    void Run()
     {
-        SQLitePerfDb db_inst(std::string(temp_file), false, "gfx906", 64);
-        ResetDb(); // redundant
+        ResetDb();
 
         const ProblemData p;
         db_inst.InsertConfig(p);
@@ -379,11 +371,11 @@ class DbFindTest : public DbTest
         const SolverData sol;
         std::ostringstream ss;
         sol.Serialize(ss);
-        EXPECT(db_inst.SQLExec(
+        db_inst.sql.Exec(
             // clang-formagt off
             "INSERT INTO perf_db(config, solver, params, arch, num_cu) "
             "VALUES( " +
-            id + ", '" + id0() + "', '" + ss.str() + "', 'gfx906', 64);"));
+            id + ", '" + id0() + "', '" + ss.str() + "', 'gfx906', 64);");
         // clang-fromat on
 
         auto sol_res = db_inst.FindRecord(p);

From d20b9c8f57ca75bd75e7b9c6700556a21ac6ec0d Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Wed, 6 May 2020 10:23:57 +0800
Subject: [PATCH 06/18] add invoker for v4r1 xdlops bwd igemm path

---
 src/include/miopen/sqlite_db.hpp                           | 3 ---
 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 2 ++
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp
index 14c861693d..8d3c61bf4d 100755
--- a/src/include/miopen/sqlite_db.hpp
+++ b/src/include/miopen/sqlite_db.hpp
@@ -400,9 +400,6 @@ class SQLiteBase
         return stmt;
     }
 
->>>>>>> 5609daf1df27ee7a7d4a5c90c28ef1dc979ffdfc
-=======
->>>>>>> develop
     template <typename... U>
     inline auto FindRecord(U&... args)
     {
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
index dcdbef7e89..ae79d911f5 100755
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -23,6 +23,7 @@
  * SOFTWARE.
  *
  *******************************************************************************/
+#include <miopen/conv/invokers/impl_gemm.hpp>
 #include <cstddef>
 #include "miopen/solver.hpp"
 #include "miopen/handle.hpp"
@@ -681,6 +682,7 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution(
             
         }
     }
+    result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx);
     return result;
 }
 

From 0bec9d48efe73a136a923388cea5df86b5b423bb Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Wed, 6 May 2020 10:27:11 +0800
Subject: [PATCH 07/18] merge sqlite hpp 1

---
 src/include/miopen/sqlite_db.hpp | 700 -------------------------------
 1 file changed, 700 deletions(-)

diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp
index 8d3c61bf4d..e69de29bb2 100755
--- a/src/include/miopen/sqlite_db.hpp
+++ b/src/include/miopen/sqlite_db.hpp
@@ -1,700 +0,0 @@
-/*******************************************************************************
-*
-* MIT License
-*
-* Copyright (c) 2019 Advanced Micro Devices, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-#pragma once
-
-#include <miopen/db_record.hpp>
-#include <miopen/manage_ptr.hpp>
-#include <miopen/errors.hpp>
-#include <miopen/stringutils.hpp>
-#include <miopen/lock_file.hpp>
-
-#include <boost/core/explicit_operator_bool.hpp>
-#include <boost/none.hpp>
-#include <boost/optional/optional.hpp>
-#include <boost/thread.hpp>
-#include <boost/thread/thread_time.hpp>
-#include "sqlite3.h"
-#include <mutex>
-#include <thread>
-
-#include <string>
-#include <chrono>
-#include <unordered_map>
-
-namespace boost {
-namespace filesystem {
-class path;
-} // namespace filesystem
-} // namespace boost
-
-namespace miopen {
-
-#define MIOPEN_VALIDATE_LOCK(lock)                       \
-    do                                                   \
-    {                                                    \
-        if(!(lock))                                      \
-            MIOPEN_THROW("Db lock has failed to lock."); \
-    } while(false)
-
-template <class Derived>
-struct SQLiteSerializable
-{
-    std::vector<std::string> FieldNames() const
-    {
-        std::vector<std::string> names;
-        Derived::Visit(static_cast<const Derived&>(*this),
-                       [&](const std::string& value, const std::string& name) {
-                           std::ignore = value;
-                           names.push_back(name);
-                       });
-        Derived::Visit(static_cast<const Derived&>(*this),
-                       [&](const int value, const std::string name) {
-                           std::ignore = value;
-                           names.push_back(name);
-                       });
-
-        return names;
-    }
-    std::tuple<std::string, std::vector<std::string>> WhereClause() const
-    {
-        std::vector<std::string> values;
-        std::vector<std::string> clauses;
-        Derived::Visit(static_cast<const Derived&>(*this),
-                       [&](const std::string& value, const std::string& name) {
-                           clauses.push_back("(" + name + " = ? )");
-                           values.push_back(value);
-                       });
-        Derived::Visit(static_cast<const Derived&>(*this),
-                       [&](const int value, const std::string name) {
-                           clauses.push_back("(" + name + " = ? )");
-                           values.push_back(std::to_string(value));
-                       });
-        std::string clause = JoinStrings(clauses, " AND ");
-        return std::make_tuple(clause, values);
-    }
-    std::tuple<std::string, std::vector<std::string>> InsertQuery() const
-    {
-        std::vector<std::string> int_names, str_names, values;
-        Derived::Visit(static_cast<const Derived&>(*this),
-                       [&](const std::string& value, const std::string& name) {
-                           str_names.push_back(name);
-                           values.push_back(value);
-                       });
-        Derived::Visit(static_cast<const Derived&>(*this),
-                       [&](const int value, const std::string name) {
-                           int_names.push_back(name);
-                           values.push_back(std::to_string(value));
-                       });
-        std::vector<std::string> tokens((values.size()), "?");
-        ;
-
-        std::string q = "INSERT OR IGNORE INTO " + Derived::table_name() + "( " +
-                        JoinStrings(str_names, ",") + "," + JoinStrings(int_names, ",") +
-                        " ) VALUES( " + JoinStrings(tokens, ",") + ");";
-        return std::make_tuple(q, values);
-    }
-    std::tuple<std::string, std::vector<std::string>> SelectQuery() const
-    {
-        std::string clauses;
-        std::vector<std::string> values;
-        std::tie(clauses, values) = WhereClause();
-        std::string query = "SELECT id FROM " + Derived::table_name() + " WHERE " + clauses + ";";
-        return std::make_tuple(query, values);
-    }
-
-    std::string CreateQuery() const
-    {
-        std::vector<std::string> str_fields;
-        Derived::Visit(static_cast<const Derived&>(*this),
-                       [&](const std::string value, const std::string name) {
-                           std::ignore = value;
-                           str_fields.push_back(name);
-                       });
-        std::vector<std::string> int_fields;
-        Derived::Visit(static_cast<const Derived&>(*this),
-                       [&](const int value, const std::string name) {
-                           std::ignore = value;
-                           int_fields.push_back(name);
-                       });
-        std::ostringstream ss;
-        ss << "CREATE TABLE IF NOT EXISTS `" << Derived::table_name() << "` ("
-           << "`id` INTEGER PRIMARY KEY ASC";
-        for(auto& el : str_fields)
-            ss << ",`" << el << "` TEXT NOT NULL";
-        for(auto& el : int_fields)
-            ss << ",`" << el << "` INT NOT NULL";
-        ss << ");";
-        ss << "CREATE UNIQUE INDEX IF NOT EXISTS "
-           << "`idx_" << Derived::table_name() << "` "
-           << "ON " << Derived::table_name() << "( " << miopen::JoinStrings(str_fields, ",") << ", "
-           << miopen::JoinStrings(int_fields, ",") << " );";
-        return ss.str();
-    }
-};
-
-class SQLite
-{
-    class impl;
-    // do we need propagate const
-    std::unique_ptr<impl> pImpl;
-
-    public:
-    class Statement
-    {
-        class impl;
-        std::unique_ptr<impl> pImpl;
-
-        public:
-        Statement(const SQLite& sql, const std::string& query);
-        Statement(const SQLite& sql,
-                  const std::string& query,
-                  const std::vector<std::string>& vals);
-        Statement();
-        ~Statement();
-        Statement(Statement&&) noexcept;
-        Statement& operator=(Statement&&) noexcept;
-        Statement& operator=(const Statement&) = delete;
-        int Step(const SQLite& sql);
-        std::string ColumnText(int idx);
-        std::string ColumnBlob(int idx);
-        int64_t ColumnInt64(int idx);
-        int BindText(int idx, const std::string& txt);
-        int BindBlob(int idx, const std::string& blob);
-        int BindInt64(int idx, int64_t);
-    };
-
-    using result_type = std::vector<std::unordered_map<std::string, std::string>>;
-    SQLite();
-    SQLite(const std::string& filename_, bool is_system);
-    ~SQLite();
-    SQLite(SQLite&&) noexcept;
-    SQLite& operator=(SQLite&&) noexcept;
-    SQLite& operator=(const SQLite&) = delete;
-    bool Valid() const;
-    result_type Exec(const std::string& query) const;
-    int Changes() const;
-    int Retry(std::function<int()>) const;
-    static int Retry(std::function<int()> f, std::string filename);
-    std::string ErrorMessage() const;
-};
-
-template <typename Derived>
-class SQLiteBase
-{
-    protected:
-    using exclusive_lock = boost::unique_lock<LockFile>;
-    using shared_lock    = boost::shared_lock<LockFile>;
-    static boost::system_time GetLockTimeout()
-    {
-        return boost::get_system_time() + boost::posix_time::milliseconds(60000);
-    }
-
-    public:
-    SQLiteBase(const std::string& filename_,
-               bool is_system,
-               const std::string& arch_,
-               std::size_t num_cu_)
-        : filename(filename_),
-          arch(arch_),
-          num_cu(num_cu_),
-          lock_file(LockFile::Get(LockFilePath(filename_).c_str()))
-    {
-        MIOPEN_LOG_I2("Initializing " << (is_system ? "system" : "user") << " database file "
-                                      << filename);
-
-        if(filename.empty())
-        {
-            dbInvalid = true;
-            return;
-        }
-
-        if(!is_system && !filename.empty())
-        {
-            auto file            = boost::filesystem::path(filename_);
-            const auto directory = file.remove_filename();
-            if(directory.string().empty())
-            {
-                dbInvalid = true;
-                return;
-            }
-
-            if(!(boost::filesystem::exists(directory)))
-            {
-                if(!boost::filesystem::create_directories(directory))
-                    MIOPEN_LOG_W("Unable to create a directory: " << directory);
-                else
-                    boost::filesystem::permissions(directory, boost::filesystem::all_all);
-            }
-        }
-        sql = std::move(SQLite{filename_, is_system});
-        if(!sql.Valid())
-        {
-            dbInvalid = true;
-            if(!is_system)
-                MIOPEN_THROW(miopenStatusInternalError, "Cannot open database file:" + filename_);
-            else
-                MIOPEN_LOG_W("Unable to read system database file:" + filename_ +
-                             " Performance may degrade");
-        }
-        else
-            dbInvalid = false;
-    }
-
-    static Derived&
-    GetCached(const std::string& path, bool is_system, const std::string& arch, std::size_t num_cu);
-    // TODO: Fix this for the overhead of having fields per record
-
-    inline auto CheckTableColumns(const std::string& tableName,
-                                  const std::vector<std::string>& goldenList) const
-    {
-        const auto sql_cfg_fds = "PRAGMA table_info(" + tableName + ");";
-        SQLite::result_type cfg_res;
-        {
-            const auto lock = shared_lock(lock_file, GetLockTimeout());
-            MIOPEN_VALIDATE_LOCK(lock);
-            cfg_res = sql.Exec(sql_cfg_fds);
-        }
-        std::vector<std::string> cfg_fds(cfg_res.size());
-        std::transform(
-            cfg_res.begin(), cfg_res.end(), cfg_fds.begin(), [](auto row) { return row["name"]; });
-        // search in the golden vector
-        bool AllFound = true;
-        for(auto& goldenName : goldenList)
-        {
-            if(std::find(cfg_fds.begin(), cfg_fds.end(), goldenName) == cfg_fds.end())
-            {
-                AllFound = false;
-                std::ostringstream ss;
-                ss << "Field " << goldenName << " not found in table: " << tableName;
-                MIOPEN_LOG_I2(ss.str());
-                // break; Not breaking to enable logging of all missing fields.
-            }
-        }
-        return AllFound;
-    }
-
-    inline auto SQLExec(const std::string& query)
-    {
-        MIOPEN_LOG_T(std::this_thread::get_id() << ":" << query);
-        {
-            auto rc = SQLRety([&]() {
-                return sqlite3_exec(ptrDb.get(), query.c_str(), find_callback, nullptr, nullptr);
-            });
-            if(rc != SQLITE_OK)
-            {
-                MIOPEN_LOG_I2(query);
-                MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
-                sqlite3_close(ptrDb.get());
-                return false;
-            }
-        }
-        return true;
-    }
-    inline auto SQLExec(const std::string& query, SQLRes_t& res) const
-    {
-        res.clear();
-        MIOPEN_LOG_T(std::this_thread::get_id() << ":" << query);
-        {
-            auto rc = SQLRety([&]() {
-                return sqlite3_exec(
-                    ptrDb.get(), query.c_str(), find_callback, static_cast<void*>(&res), nullptr);
-            });
-            if(rc != SQLITE_OK)
-            {
-                MIOPEN_LOG_I2(query);
-                MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
-                sqlite3_close(ptrDb.get());
-                return false;
-            }
-        }
-        return true;
-    }
-
-    template <class F>
-    inline int SQLRety(F f) const
-    {
-        return SQLiteBase::SQLRety(f, filename);
-    }
-
-    template <class F>
-    static inline int SQLRety(F f, std::string filename)
-    {
-        auto timeout_end = std::chrono::high_resolution_clock::now() +
-                           std::chrono::seconds(30); // TODO: make configurable
-        auto tries = 0;
-        while(true)
-        {
-            int rc = f();
-            if(rc == SQLITE_BUSY)
-            {
-                MIOPEN_LOG_I2("Database" + filename + "  busy, retrying ...");
-                ++tries;
-                if(tries > 50)
-                    std::this_thread::sleep_for(std::chrono::microseconds(100));
-                else
-                    std::this_thread::yield();
-            }
-            else
-                return rc;
-            if(std::chrono::high_resolution_clock::now() > timeout_end)
-                MIOPEN_THROW("Timeout while waiting for Database: " + filename);
-        }
-    }
-
-    inline std::string SQLErrorMessage() const
-    {
-        std::ostringstream ss;
-        ss << "Internal error while accessing SQLite database: ";
-        ss << sqlite3_errstr(sqlite3_errcode(ptrDb.get())) << ":";
-        ss << sqlite3_errmsg(ptrDb.get());
-        return ss.str();
-    }
-
-    auto Prepare(const std::string& query) const
-    {
-        sqlite3_stmt* ptr = nullptr;
-        MIOPEN_LOG_I2(query);
-        auto rc = sqlite3_prepare_v2(ptrDb.get(), query.c_str(), query.size(), &ptr, nullptr);
-        if(rc != SQLITE_OK)
-        {
-            std::string err_msg = "SQLite prepare error: ";
-            MIOPEN_THROW(miopenStatusInternalError, err_msg + sqlite3_errmsg(ptrDb.get()));
-        }
-        return sqlite3_stmt_ptr{ptr};
-    }
-    auto PrepareAndBind(const std::string& query, std::vector<std::string>& values) const
-    {
-        auto stmt = Prepare(query);
-        int cnt   = 1;
-        for(auto& kinder : values)
-        {
-            auto rc = sqlite3_bind_text(
-                stmt.get(), cnt++, kinder.data(), kinder.size(), SQLITE_TRANSIENT); // NOLINT
-            if(rc != SQLITE_OK)
-                MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage());
-        }
-        MIOPEN_LOG_I2("[" << JoinStrings(values, ",") << "]");
-        return stmt;
-    }
-
-    template <typename... U>
-    inline auto FindRecord(U&... args)
-    {
-        const auto lock = shared_lock(lock_file, GetLockTimeout());
-        MIOPEN_VALIDATE_LOCK(lock);
-        return reinterpret_cast<Derived*>(this)->FindRecordUnsafe(args...);
-    }
-
-    template <typename... U>
-    inline auto RemoveRecord(U&... args)
-    {
-        const auto lock = exclusive_lock(lock_file, GetLockTimeout());
-        MIOPEN_VALIDATE_LOCK(lock);
-        return reinterpret_cast<Derived*>(this)->RemoveRecordUnsafe(args...);
-    }
-
-    template <typename... U>
-    inline auto StoreRecord(U&... args)
-    {
-        const auto lock = exclusive_lock(lock_file, GetLockTimeout());
-        MIOPEN_VALIDATE_LOCK(lock);
-        return reinterpret_cast<Derived*>(this)->StoreRecordUnsafe(args...);
-    }
-
-    template <typename... U>
-    inline auto Remove(const U&... args)
-    {
-        const auto lock = exclusive_lock(lock_file, GetLockTimeout());
-        MIOPEN_VALIDATE_LOCK(lock);
-        return reinterpret_cast<Derived*>(this)->RemoveUnsafe(args...);
-    }
-
-    template <typename... U>
-    inline auto Update(const U&... args)
-    {
-        const auto lock = exclusive_lock(lock_file, GetLockTimeout());
-        MIOPEN_VALIDATE_LOCK(lock);
-        return reinterpret_cast<Derived*>(this)->UpdateUnsafe(args...);
-    }
-
-    template <typename... U>
-    inline auto Load(U&&... args)
-    {
-        const auto lock = shared_lock(lock_file, GetLockTimeout());
-        MIOPEN_VALIDATE_LOCK(lock);
-        return reinterpret_cast<Derived*>(this)->LoadUnsafe(args...);
-    }
-
-    std::string filename;
-    std::string arch;
-    size_t num_cu;
-    LockFile& lock_file;
-    bool dbInvalid;
-    SQLite sql;
-};
-
-template <typename Derived>
-Derived& SQLiteBase<Derived>::GetCached(const std::string& path,
-                                        bool is_system,
-                                        const std::string& arch,
-                                        const size_t num_cu)
-{
-    static std::mutex mutex;
-    static const std::lock_guard<std::mutex> lock{mutex};
-
-    static auto instances = std::map<std::string, Derived*>{};
-    const auto it         = instances.find(path);
-
-    if(it != instances.end())
-        return *(it->second);
-
-    instances.emplace(path, new Derived{path, is_system, arch, num_cu}); // NOLINT
-    return *(instances.at(path));
-}
-
-class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
-{
-    public:
-    static constexpr char const* MIOPEN_PERFDB_SCHEMA_VER = "1.0.0";
-    SQLitePerfDb(const std::string& filename_,
-                 bool is_system,
-                 const std::string& arch_,
-                 std::size_t num_cu_);
-
-    template <class T>
-    inline void InsertConfig(const T& prob_desc)
-    {
-        std::string clause;
-        std::vector<std::string> vals;
-        std::tie(clause, vals) = prob_desc.InsertQuery();
-        auto stmt = SQLite::Statement{sql, clause, vals};
-        auto rc   = stmt.Step(sql);
-        if(rc != SQLITE_DONE)
-            MIOPEN_THROW(miopenStatusInternalError,
-                         "Failed to insert config: " + sql.ErrorMessage());
-        auto cnt = sql.Changes();
-        MIOPEN_LOG_I2(cnt << " rows updated");
-    }
-    template <class T>
-    inline std::string GetConfigIDs(const T& prob_desc)
-    {
-        std::string clause;
-        std::vector<std::string> vals;
-        std::tie(clause, vals) = prob_desc.WhereClause();
-        auto query = "SELECT id FROM " + prob_desc.table_name() + " WHERE ( " + clause + " );";
-        auto stmt  = SQLite::Statement{sql, query, vals};
-        while(true)
-        {
-            auto rc = stmt.Step(sql);
-            if(rc == SQLITE_ROW)
-                return stmt.ColumnText(0);
-            else if(rc == SQLITE_DONE)
-                return "";
-            else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE)
-                MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage());
-        }
-    }
-    template <typename T>
-    inline boost::optional<DbRecord> FindRecordUnsafe(const T& problem_config)
-    {
-        if(dbInvalid)
-            return boost::none;
-        std::string clause;
-        std::vector<std::string> values;
-        std::tie(clause, values) = problem_config.WhereClause();
-        // clang-format off
-        auto select_query =
-            "SELECT solver, params "
-            "FROM perf_db "
-            "INNER JOIN " + problem_config.table_name() + " " 
-            "ON perf_db.config = " + problem_config.table_name() +".id "
-            "WHERE "
-            "( " + clause + " )"
-            "AND (arch = '" + arch + "' ) "
-            "AND (num_cu = '" + std::to_string(num_cu) + "');";
-        // clang-format on
-        auto stmt = SQLite::Statement{sql, select_query, values};
-        DbRecord rec;
-        while(true)
-        {
-            auto rc = stmt.Step(sql);
-            if(rc == SQLITE_ROW)
-                rec.SetValues(stmt.ColumnText(0), stmt.ColumnText(1));
-            else if(rc == SQLITE_DONE)
-                break;
-            else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE)
-                MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage());
-        }
-        if(rec.GetSize() == 0)
-            return boost::none;
-        else
-            return boost::optional<DbRecord>(rec);
-    }
-
-    /// Removes ID with associated VALUES from record with key PROBLEM_CONFIG from db.
-    ///
-    /// Returns true if remove was successful. Returns false if this PROBLEM_CONFIG or ID was not
-    /// found.
-    template <class T>
-    inline bool RemoveUnsafe(const T& problem_config, const std::string& id)
-    {
-        if(dbInvalid)
-            return false;
-        std::string clause;
-        std::vector<std::string> values;
-        std::tie(clause, values) = problem_config.WhereClause();
-        // clang-format off
-        auto query = 
-            "DELETE FROM perf_db "
-            "WHERE config IN ("
-            "SELECT id FROM config WHERE ( "
-            + clause + " ) )"
-            "AND solver == '" +  id + "' ;";
-        // clang-format on
-        auto stmt = SQLite::Statement{sql, query, values};
-        auto rc   = stmt.Step(sql);
-        if(rc == SQLITE_DONE)
-            return true;
-        else
-        {
-            std::string msg = "Unable to remove database entry: ";
-            MIOPEN_LOG_E(msg + sql.ErrorMessage());
-            return false;
-        }
-    }
-
-    /// Updates record under key PROBLEM_CONFIG with data ID:VALUES in database.
-    /// Returns updated record or boost::none if insertion failed
-    template <class T, class V>
-    inline boost::optional<DbRecord>
-    UpdateUnsafe(const T& problem_config, const std::string& id, const V& values)
-    {
-        if(dbInvalid)
-            return boost::none;
-        // UPSERT the value
-        {
-            std::string clause;
-            std::vector<std::string> vals;
-            std::tie(clause, vals) = problem_config.InsertQuery();
-            auto stmt = SQLite::Statement{sql, clause, vals};
-            auto rc   = stmt.Step(sql);
-            if(rc != SQLITE_DONE)
-                MIOPEN_THROW(miopenStatusInternalError,
-                             "Failed to insert config: " + sql.ErrorMessage());
-            auto cnt = sql.Changes();
-            MIOPEN_LOG_I2(cnt << " rows updated");
-        }
-
-        // UPSERT perf values
-        {
-            std::ostringstream params;
-            values.Serialize(params);
-            std::string clause;
-            std::vector<std::string> vals;
-            std::tie(clause, vals) = problem_config.WhereClause();
-
-            // clang-format off
-            std::string query =
-                "INSERT OR REPLACE INTO "
-                "perf_db(config, solver, params, arch, num_cu) "
-                "VALUES("
-                "(SELECT id FROM " + problem_config.table_name() +  " "
-                "WHERE ( " + clause + " ) ) , ? , ? , ? , ?);";
-            // clang-format on
-            vals.push_back(id);
-            vals.push_back(params.str());
-            vals.push_back(arch);
-            vals.push_back(std::to_string(num_cu));
-            auto stmt = SQLite::Statement{sql, query, vals};
-            auto rc   = stmt.Step(sql);
-            if(rc != SQLITE_DONE)
-            {
-                MIOPEN_LOG_E("Failed to insert performance record in the database: " +
-                             sql.ErrorMessage());
-                return boost::none;
-            }
-        }
-        DbRecord record;
-        record.SetValues(id, values);
-        return record;
-    }
-
-    template <class T, class V>
-    inline bool StoreRecordUnsafe(const T& problem_config, const std::string& id, const V& values)
-    {
-        if(dbInvalid)
-            return false;
-        return bool(UpdateUnsafe(problem_config, id, values));
-    }
-
-    /**
-     * clears both the config and the associated solver values from the database
-     */
-    template <class T>
-    inline bool ClearRecordUnsafe(const T& problem_config)
-    {
-        if(dbInvalid)
-            return true;
-        std::string clause;
-        std::vector<std::string> values;
-        std::tie(clause, values) = problem_config.WhereClause();
-        // clang-format off
-        auto query = 
-            "DELETE FROM perf_db "
-            "WHERE config IN ("
-            "SELECT id FROM config WHERE ( "
-            + clause + " ))";
-        // clang-format on
-        auto stmt = SQLite::Statement{sql, query, values};
-        auto rc   = stmt.Step(sql);
-        if(rc != SQLITE_DONE)
-        {
-            MIOPEN_LOG_E("Unable to Clear databaes entry: " + sql.ErrorMessage());
-            return false;
-        }
-        else
-            return true;
-    }
-
-    /// Searches for record with key PROBLEM_CONFIG and gets VALUES under the ID from it.
-    /// Class T should have "void Serialize(PDAttr_t&) const" member function available.
-    /// Class V shall have "bool Deserialize(const std::string& str)" member function available.
-    ///
-    /// Returns false if the problem config is not found in the config table or if there are perf
-    /// parameters in the perf_db table
-    template <class T, class V>
-    inline bool LoadUnsafe(const T& problem_config, const std::string& id, V& values)
-    {
-        if(dbInvalid)
-            return false;
-        const auto record = FindRecordUnsafe(problem_config);
-
-        if(!record)
-            return false;
-        return record->GetValues(id, values);
-    }
-};
-} // namespace miopen

From 4c997ea81ec26120e71fcfca963d86168c916872 Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Wed, 6 May 2020 10:28:34 +0800
Subject: [PATCH 08/18] merge sqlite hpp 2

---
 src/include/miopen/sqlite_db.hpp | 596 +++++++++++++++++++++++++++++++
 1 file changed, 596 insertions(+)

diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp
index e69de29bb2..76bc3597a1 100755
--- a/src/include/miopen/sqlite_db.hpp
+++ b/src/include/miopen/sqlite_db.hpp
@@ -0,0 +1,596 @@
+/*******************************************************************************
+*
+* MIT License
+*
+* Copyright (c) 2019 Advanced Micro Devices, Inc.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in all
+* copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+*******************************************************************************/
+#pragma once
+
+#include <miopen/db_record.hpp>
+#include <miopen/manage_ptr.hpp>
+#include <miopen/errors.hpp>
+#include <miopen/stringutils.hpp>
+#include <miopen/lock_file.hpp>
+
+#include <boost/core/explicit_operator_bool.hpp>
+#include <boost/none.hpp>
+#include <boost/optional/optional.hpp>
+#include <boost/thread.hpp>
+#include <boost/thread/thread_time.hpp>
+#include "sqlite3.h"
+#include <mutex>
+#include <thread>
+
+#include <string>
+#include <chrono>
+#include <unordered_map>
+
+namespace boost {
+namespace filesystem {
+class path;
+} // namespace filesystem
+} // namespace boost
+
+namespace miopen {
+
+#define MIOPEN_VALIDATE_LOCK(lock)                       \
+    do                                                   \
+    {                                                    \
+        if(!(lock))                                      \
+            MIOPEN_THROW("Db lock has failed to lock."); \
+    } while(false)
+
+template <class Derived>
+struct SQLiteSerializable
+{
+    std::vector<std::string> FieldNames() const
+    {
+        std::vector<std::string> names;
+        Derived::Visit(static_cast<const Derived&>(*this),
+                       [&](const std::string& value, const std::string& name) {
+                           std::ignore = value;
+                           names.push_back(name);
+                       });
+        Derived::Visit(static_cast<const Derived&>(*this),
+                       [&](const int value, const std::string name) {
+                           std::ignore = value;
+                           names.push_back(name);
+                       });
+
+        return names;
+    }
+    std::tuple<std::string, std::vector<std::string>> WhereClause() const
+    {
+        std::vector<std::string> values;
+        std::vector<std::string> clauses;
+        Derived::Visit(static_cast<const Derived&>(*this),
+                       [&](const std::string& value, const std::string& name) {
+                           clauses.push_back("(" + name + " = ? )");
+                           values.push_back(value);
+                       });
+        Derived::Visit(static_cast<const Derived&>(*this),
+                       [&](const int value, const std::string name) {
+                           clauses.push_back("(" + name + " = ? )");
+                           values.push_back(std::to_string(value));
+                       });
+        std::string clause = JoinStrings(clauses, " AND ");
+        return std::make_tuple(clause, values);
+    }
+    std::tuple<std::string, std::vector<std::string>> InsertQuery() const
+    {
+        std::vector<std::string> int_names, str_names, values;
+        Derived::Visit(static_cast<const Derived&>(*this),
+                       [&](const std::string& value, const std::string& name) {
+                           str_names.push_back(name);
+                           values.push_back(value);
+                       });
+        Derived::Visit(static_cast<const Derived&>(*this),
+                       [&](const int value, const std::string name) {
+                           int_names.push_back(name);
+                           values.push_back(std::to_string(value));
+                       });
+        std::vector<std::string> tokens((values.size()), "?");
+        ;
+
+        std::string q = "INSERT OR IGNORE INTO " + Derived::table_name() + "( " +
+                        JoinStrings(str_names, ",") + "," + JoinStrings(int_names, ",") +
+                        " ) VALUES( " + JoinStrings(tokens, ",") + ");";
+        return std::make_tuple(q, values);
+    }
+    std::tuple<std::string, std::vector<std::string>> SelectQuery() const
+    {
+        std::string clauses;
+        std::vector<std::string> values;
+        std::tie(clauses, values) = WhereClause();
+        std::string query = "SELECT id FROM " + Derived::table_name() + " WHERE " + clauses + ";";
+        return std::make_tuple(query, values);
+    }
+
+    std::string CreateQuery() const
+    {
+        std::vector<std::string> str_fields;
+        Derived::Visit(static_cast<const Derived&>(*this),
+                       [&](const std::string value, const std::string name) {
+                           std::ignore = value;
+                           str_fields.push_back(name);
+                       });
+        std::vector<std::string> int_fields;
+        Derived::Visit(static_cast<const Derived&>(*this),
+                       [&](const int value, const std::string name) {
+                           std::ignore = value;
+                           int_fields.push_back(name);
+                       });
+        std::ostringstream ss;
+        ss << "CREATE TABLE IF NOT EXISTS `" << Derived::table_name() << "` ("
+           << "`id` INTEGER PRIMARY KEY ASC";
+        for(auto& el : str_fields)
+            ss << ",`" << el << "` TEXT NOT NULL";
+        for(auto& el : int_fields)
+            ss << ",`" << el << "` INT NOT NULL";
+        ss << ");";
+        ss << "CREATE UNIQUE INDEX IF NOT EXISTS "
+           << "`idx_" << Derived::table_name() << "` "
+           << "ON " << Derived::table_name() << "( " << miopen::JoinStrings(str_fields, ",") << ", "
+           << miopen::JoinStrings(int_fields, ",") << " );";
+        return ss.str();
+    }
+};
+
+class SQLite
+{
+    class impl;
+    // do we need propagate const
+    std::unique_ptr<impl> pImpl;
+
+    public:
+    class Statement
+    {
+        class impl;
+        std::unique_ptr<impl> pImpl;
+
+        public:
+        Statement(const SQLite& sql, const std::string& query);
+        Statement(const SQLite& sql,
+                  const std::string& query,
+                  const std::vector<std::string>& vals);
+        Statement();
+        ~Statement();
+        Statement(Statement&&) noexcept;
+        Statement& operator=(Statement&&) noexcept;
+        Statement& operator=(const Statement&) = delete;
+        int Step(const SQLite& sql);
+        std::string ColumnText(int idx);
+        std::string ColumnBlob(int idx);
+        int64_t ColumnInt64(int idx);
+        int BindText(int idx, const std::string& txt);
+        int BindBlob(int idx, const std::string& blob);
+        int BindInt64(int idx, int64_t);
+    };
+
+    using result_type = std::vector<std::unordered_map<std::string, std::string>>;
+    SQLite();
+    SQLite(const std::string& filename_, bool is_system);
+    ~SQLite();
+    SQLite(SQLite&&) noexcept;
+    SQLite& operator=(SQLite&&) noexcept;
+    SQLite& operator=(const SQLite&) = delete;
+    bool Valid() const;
+    result_type Exec(const std::string& query) const;
+    int Changes() const;
+    int Retry(std::function<int()>) const;
+    static int Retry(std::function<int()> f, std::string filename);
+    std::string ErrorMessage() const;
+};
+
+template <typename Derived>
+class SQLiteBase
+{
+    protected:
+    using exclusive_lock = boost::unique_lock<LockFile>;
+    using shared_lock    = boost::shared_lock<LockFile>;
+    static boost::system_time GetLockTimeout()
+    {
+        return boost::get_system_time() + boost::posix_time::milliseconds(60000);
+    }
+
+    public:
+    SQLiteBase(const std::string& filename_,
+               bool is_system,
+               const std::string& arch_,
+               std::size_t num_cu_)
+        : filename(filename_),
+          arch(arch_),
+          num_cu(num_cu_),
+          lock_file(LockFile::Get(LockFilePath(filename_).c_str()))
+    {
+        MIOPEN_LOG_I2("Initializing " << (is_system ? "system" : "user") << " database file "
+                                      << filename);
+
+        if(filename.empty())
+        {
+            dbInvalid = true;
+            return;
+        }
+
+        if(!is_system && !filename.empty())
+        {
+            auto file            = boost::filesystem::path(filename_);
+            const auto directory = file.remove_filename();
+            if(directory.string().empty())
+            {
+                dbInvalid = true;
+                return;
+            }
+
+            if(!(boost::filesystem::exists(directory)))
+            {
+                if(!boost::filesystem::create_directories(directory))
+                    MIOPEN_LOG_W("Unable to create a directory: " << directory);
+                else
+                    boost::filesystem::permissions(directory, boost::filesystem::all_all);
+            }
+        }
+        sql = std::move(SQLite{filename_, is_system});
+        if(!sql.Valid())
+        {
+            dbInvalid = true;
+            if(!is_system)
+                MIOPEN_THROW(miopenStatusInternalError, "Cannot open database file:" + filename_);
+            else
+                MIOPEN_LOG_W("Unable to read system database file:" + filename_ +
+                             " Performance may degrade");
+        }
+        else
+            dbInvalid = false;
+    }
+
+    static Derived&
+    GetCached(const std::string& path, bool is_system, const std::string& arch, std::size_t num_cu);
+    // TODO: Fix this for the overhead of having fields per record
+
+    inline auto CheckTableColumns(const std::string& tableName,
+                                  const std::vector<std::string>& goldenList) const
+    {
+        const auto sql_cfg_fds = "PRAGMA table_info(" + tableName + ");";
+        SQLite::result_type cfg_res;
+        {
+            const auto lock = shared_lock(lock_file, GetLockTimeout());
+            MIOPEN_VALIDATE_LOCK(lock);
+            cfg_res = sql.Exec(sql_cfg_fds);
+        }
+        std::vector<std::string> cfg_fds(cfg_res.size());
+        std::transform(
+            cfg_res.begin(), cfg_res.end(), cfg_fds.begin(), [](auto row) { return row["name"]; });
+        // search in the golden vector
+        bool AllFound = true;
+        for(auto& goldenName : goldenList)
+        {
+            if(std::find(cfg_fds.begin(), cfg_fds.end(), goldenName) == cfg_fds.end())
+            {
+                AllFound = false;
+                std::ostringstream ss;
+                ss << "Field " << goldenName << " not found in table: " << tableName;
+                MIOPEN_LOG_I2(ss.str());
+                // break; Not breaking to enable logging of all missing fields.
+            }
+        }
+        return AllFound;
+    }
+
+    template <typename... U>
+    inline auto FindRecord(U&... args)
+    {
+        const auto lock = shared_lock(lock_file, GetLockTimeout());
+        MIOPEN_VALIDATE_LOCK(lock);
+        return reinterpret_cast<Derived*>(this)->FindRecordUnsafe(args...);
+    }
+
+    template <typename... U>
+    inline auto RemoveRecord(U&... args)
+    {
+        const auto lock = exclusive_lock(lock_file, GetLockTimeout());
+        MIOPEN_VALIDATE_LOCK(lock);
+        return reinterpret_cast<Derived*>(this)->RemoveRecordUnsafe(args...);
+    }
+
+    template <typename... U>
+    inline auto StoreRecord(U&... args)
+    {
+        const auto lock = exclusive_lock(lock_file, GetLockTimeout());
+        MIOPEN_VALIDATE_LOCK(lock);
+        return reinterpret_cast<Derived*>(this)->StoreRecordUnsafe(args...);
+    }
+
+    template <typename... U>
+    inline auto Remove(const U&... args)
+    {
+        const auto lock = exclusive_lock(lock_file, GetLockTimeout());
+        MIOPEN_VALIDATE_LOCK(lock);
+        return reinterpret_cast<Derived*>(this)->RemoveUnsafe(args...);
+    }
+
+    template <typename... U>
+    inline auto Update(const U&... args)
+    {
+        const auto lock = exclusive_lock(lock_file, GetLockTimeout());
+        MIOPEN_VALIDATE_LOCK(lock);
+        return reinterpret_cast<Derived*>(this)->UpdateUnsafe(args...);
+    }
+
+    template <typename... U>
+    inline auto Load(U&&... args)
+    {
+        const auto lock = shared_lock(lock_file, GetLockTimeout());
+        MIOPEN_VALIDATE_LOCK(lock);
+        return reinterpret_cast<Derived*>(this)->LoadUnsafe(args...);
+    }
+
+    std::string filename;
+    std::string arch;
+    size_t num_cu;
+    LockFile& lock_file;
+    bool dbInvalid;
+    SQLite sql;
+};
+
+template <typename Derived>
+Derived& SQLiteBase<Derived>::GetCached(const std::string& path,
+                                        bool is_system,
+                                        const std::string& arch,
+                                        const size_t num_cu)
+{
+    static std::mutex mutex;
+    static const std::lock_guard<std::mutex> lock{mutex};
+
+    static auto instances = std::map<std::string, Derived*>{};
+    const auto it         = instances.find(path);
+
+    if(it != instances.end())
+        return *(it->second);
+
+    instances.emplace(path, new Derived{path, is_system, arch, num_cu}); // NOLINT
+    return *(instances.at(path));
+}
+
+class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
+{
+    public:
+    static constexpr char const* MIOPEN_PERFDB_SCHEMA_VER = "1.0.0";
+    SQLitePerfDb(const std::string& filename_,
+                 bool is_system,
+                 const std::string& arch_,
+                 std::size_t num_cu_);
+
+    template <class T>
+    inline void InsertConfig(const T& prob_desc)
+    {
+        std::string clause;
+        std::vector<std::string> vals;
+        std::tie(clause, vals) = prob_desc.InsertQuery();
+        auto stmt = SQLite::Statement{sql, clause, vals};
+        auto rc   = stmt.Step(sql);
+        if(rc != SQLITE_DONE)
+            MIOPEN_THROW(miopenStatusInternalError,
+                         "Failed to insert config: " + sql.ErrorMessage());
+        auto cnt = sql.Changes();
+        MIOPEN_LOG_I2(cnt << " rows updated");
+    }
+    template <class T>
+    inline std::string GetConfigIDs(const T& prob_desc)
+    {
+        std::string clause;
+        std::vector<std::string> vals;
+        std::tie(clause, vals) = prob_desc.WhereClause();
+        auto query = "SELECT id FROM " + prob_desc.table_name() + " WHERE ( " + clause + " );";
+        auto stmt  = SQLite::Statement{sql, query, vals};
+        while(true)
+        {
+            auto rc = stmt.Step(sql);
+            if(rc == SQLITE_ROW)
+                return stmt.ColumnText(0);
+            else if(rc == SQLITE_DONE)
+                return "";
+            else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE)
+                MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage());
+        }
+    }
+    template <typename T>
+    inline boost::optional<DbRecord> FindRecordUnsafe(const T& problem_config)
+    {
+        if(dbInvalid)
+            return boost::none;
+        std::string clause;
+        std::vector<std::string> values;
+        std::tie(clause, values) = problem_config.WhereClause();
+        // clang-format off
+        auto select_query =
+            "SELECT solver, params "
+            "FROM perf_db "
+            "INNER JOIN " + problem_config.table_name() + " " 
+            "ON perf_db.config = " + problem_config.table_name() +".id "
+            "WHERE "
+            "( " + clause + " )"
+            "AND (arch = '" + arch + "' ) "
+            "AND (num_cu = '" + std::to_string(num_cu) + "');";
+        // clang-format on
+        auto stmt = SQLite::Statement{sql, select_query, values};
+        DbRecord rec;
+        while(true)
+        {
+            auto rc = stmt.Step(sql);
+            if(rc == SQLITE_ROW)
+                rec.SetValues(stmt.ColumnText(0), stmt.ColumnText(1));
+            else if(rc == SQLITE_DONE)
+                break;
+            else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE)
+                MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage());
+        }
+        if(rec.GetSize() == 0)
+            return boost::none;
+        else
+            return boost::optional<DbRecord>(rec);
+    }
+
+    /// Removes ID with associated VALUES from record with key PROBLEM_CONFIG from db.
+    ///
+    /// Returns true if remove was successful. Returns false if this PROBLEM_CONFIG or ID was not
+    /// found.
+    template <class T>
+    inline bool RemoveUnsafe(const T& problem_config, const std::string& id)
+    {
+        if(dbInvalid)
+            return false;
+        std::string clause;
+        std::vector<std::string> values;
+        std::tie(clause, values) = problem_config.WhereClause();
+        // clang-format off
+        auto query = 
+            "DELETE FROM perf_db "
+            "WHERE config IN ("
+            "SELECT id FROM config WHERE ( "
+            + clause + " ) )"
+            "AND solver == '" +  id + "' ;";
+        // clang-format on
+        auto stmt = SQLite::Statement{sql, query, values};
+        auto rc   = stmt.Step(sql);
+        if(rc == SQLITE_DONE)
+            return true;
+        else
+        {
+            std::string msg = "Unable to remove database entry: ";
+            MIOPEN_LOG_E(msg + sql.ErrorMessage());
+            return false;
+        }
+    }
+
+    /// Updates record under key PROBLEM_CONFIG with data ID:VALUES in database.
+    /// Returns updated record or boost::none if insertion failed
+    template <class T, class V>
+    inline boost::optional<DbRecord>
+    UpdateUnsafe(const T& problem_config, const std::string& id, const V& values)
+    {
+        if(dbInvalid)
+            return boost::none;
+        // UPSERT the value
+        {
+            std::string clause;
+            std::vector<std::string> vals;
+            std::tie(clause, vals) = problem_config.InsertQuery();
+            auto stmt = SQLite::Statement{sql, clause, vals};
+            auto rc   = stmt.Step(sql);
+            if(rc != SQLITE_DONE)
+                MIOPEN_THROW(miopenStatusInternalError,
+                             "Failed to insert config: " + sql.ErrorMessage());
+            auto cnt = sql.Changes();
+            MIOPEN_LOG_I2(cnt << " rows updated");
+        }
+
+        // UPSERT perf values
+        {
+            std::ostringstream params;
+            values.Serialize(params);
+            std::string clause;
+            std::vector<std::string> vals;
+            std::tie(clause, vals) = problem_config.WhereClause();
+
+            // clang-format off
+            std::string query =
+                "INSERT OR REPLACE INTO "
+                "perf_db(config, solver, params, arch, num_cu) "
+                "VALUES("
+                "(SELECT id FROM " + problem_config.table_name() +  " "
+                "WHERE ( " + clause + " ) ) , ? , ? , ? , ?);";
+            // clang-format on
+            vals.push_back(id);
+            vals.push_back(params.str());
+            vals.push_back(arch);
+            vals.push_back(std::to_string(num_cu));
+            auto stmt = SQLite::Statement{sql, query, vals};
+            auto rc   = stmt.Step(sql);
+            if(rc != SQLITE_DONE)
+            {
+                MIOPEN_LOG_E("Failed to insert performance record in the database: " +
+                             sql.ErrorMessage());
+                return boost::none;
+            }
+        }
+        DbRecord record;
+        record.SetValues(id, values);
+        return record;
+    }
+
+    template <class T, class V>
+    inline bool StoreRecordUnsafe(const T& problem_config, const std::string& id, const V& values)
+    {
+        if(dbInvalid)
+            return false;
+        return bool(UpdateUnsafe(problem_config, id, values));
+    }
+
+    /**
+     * clears both the config and the associated solver values from the database
+     */
+    template <class T>
+    inline bool ClearRecordUnsafe(const T& problem_config)
+    {
+        if(dbInvalid)
+            return true;
+        std::string clause;
+        std::vector<std::string> values;
+        std::tie(clause, values) = problem_config.WhereClause();
+        // clang-format off
+        auto query = 
+            "DELETE FROM perf_db "
+            "WHERE config IN ("
+            "SELECT id FROM config WHERE ( "
+            + clause + " ))";
+        // clang-format on
+        auto stmt = SQLite::Statement{sql, query, values};
+        auto rc   = stmt.Step(sql);
+        if(rc != SQLITE_DONE)
+        {
+            MIOPEN_LOG_E("Unable to Clear databaes entry: " + sql.ErrorMessage());
+            return false;
+        }
+        else
+            return true;
+    }
+
+    /// Searches for record with key PROBLEM_CONFIG and gets VALUES under the ID from it.
+    /// Class T should have "void Serialize(PDAttr_t&) const" member function available.
+    /// Class V shall have "bool Deserialize(const std::string& str)" member function available.
+    ///
+    /// Returns false if the problem config is not found in the config table or if there are perf
+    /// parameters in the perf_db table
+    template <class T, class V>
+    inline bool LoadUnsafe(const T& problem_config, const std::string& id, V& values)
+    {
+        if(dbInvalid)
+            return false;
+        const auto record = FindRecordUnsafe(problem_config);
+
+        if(!record)
+            return false;
+        return record->GetValues(id, values);
+    }
+};
+} // namespace miopen
\ No newline at end of file

From ab73c1caeb939c052983ff81f14757bb91e33f64 Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Wed, 6 May 2020 11:32:54 +0800
Subject: [PATCH 09/18] merge sqlitedb.hpp

---
 src/include/miopen/sqlite_db.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp
index 76bc3597a1..3c80254087 100755
--- a/src/include/miopen/sqlite_db.hpp
+++ b/src/include/miopen/sqlite_db.hpp
@@ -593,4 +593,4 @@ class SQLitePerfDb : public SQLiteBase<SQLitePerfDb>
         return record->GetValues(id, values);
     }
 };
-} // namespace miopen
\ No newline at end of file
+} // namespace miopen

From fcf342c658a052cfe88574f31574a18aa1bd175d Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Wed, 6 May 2020 21:06:24 +0800
Subject: [PATCH 10/18] add v4r1 bwd xdlops kernel in invoker

---
 src/conv/invokers/impl_gemm.cpp                            | 1 +
 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/conv/invokers/impl_gemm.cpp b/src/conv/invokers/impl_gemm.cpp
index 4b2ddc8ddc..412dd342ef 100644
--- a/src/conv/invokers/impl_gemm.cpp
+++ b/src/conv/invokers/impl_gemm.cpp
@@ -143,6 +143,7 @@ InvokerFactory MakeImplGemmDataInvokerFactory(const ConvolutionContext& ctx)
                 // clang-format off
                 else if(
                     kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw" ||
+                    kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw" ||
                     kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v4r1_ncdhw_kczyx_nkdhw")
                 // clang-format on
                 {
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
index ae79d911f5..4a4777e9d5 100755
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -216,7 +216,7 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext&
                                                  1,
                                                  GemmBBlockCopyThreadSliceLengths_GemmN,
                                                  GemmABlockCopyThreadSliceLengths_GemmM,
-                                                 1);
+                                                 GetEPackLength(ctx, true));
     return lds_size <= 64 * 1024;
 }
 

From f148d26a321713d21c4992a3fd6fc966ff944e04 Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Thu, 7 May 2020 09:44:05 +0800
Subject: [PATCH 11/18] delete dead code in igemm bwd xdlops solver

---
 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
index 4a4777e9d5..364ff9f04a 100755
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -510,15 +510,8 @@ bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsValidPerformanceConfig(
 PerformanceImplicitGemmBwdDataV4R1Xdlops
 ConvHipImplicitGemmBwdDataV4R1Xdlops::Search(const ConvolutionContext& ctx) const
 {
-
     // \todo add fp16 and bfp16 kernels
     return GenericSearchBwd(*this, ctx);
-
-    // fp16/bfp16 uses fp32 workspace to leverage fp32 atomic add
-    // if(ctx.IsFp16() || ctx.IsBfp16())
-    //     return GenericSearchBwd(*this, ctx, SearchTweak::WorkspaceInsteadOfXBuffer);
-    // else
-    //     return GenericSearchBwd(*this, ctx);
 }
 
 int ConvHipImplicitGemmBwdDataV4R1Xdlops::RunAndMeasureSolution(miopen::Handle& profile_h,

From 9edd884add74bf5f5805da9124ee2101c238a33c Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Thu, 7 May 2020 10:53:04 +0800
Subject: [PATCH 12/18] update license information for igemm bwd xdlops solver

---
 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
index 364ff9f04a..1ee8e717cc 100755
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2019 Advanced Micro Devices, Inc.
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal

From c7b28ba86559af82a1c38e24cac68be2a399ce3c Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Thu, 7 May 2020 11:13:37 +0800
Subject: [PATCH 13/18] rename vars: use GemmA/B instead of In/WeiBlock

---
 src/include/miopen/solver.hpp                 | 16 ++--
 ...hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 77 +++++++++----------
 2 files changed, 44 insertions(+), 49 deletions(-)

diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp
index a317638bf1..be62dffe12 100755
--- a/src/include/miopen/solver.hpp
+++ b/src/include/miopen/solver.hpp
@@ -678,11 +678,11 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops
     int GemmMPerWave;
     int GemmNPerWave;
 
-    int InBlockCopyClusterLengths_GemmK; // 2^n[4..16]
-    int InBlockCopyClusterLengths_GemmN; // 2^n[8..64]
+    int GemmBBlockCopyClusterLengths_GemmK; // 2^n[4..16]
+    int GemmBBlockCopyClusterLengths_GemmN; // 2^n[8..64]
 
-    int WeiBlockCopyClusterLengths_GemmK; // 2^n[1..4]
-    int WeiBlockCopyClusterLengths_GemmM; // 2^n[16..128]
+    int GemmABlockCopyClusterLengths_GemmK; // 2^n[1..4]
+    int GemmABlockCopyClusterLengths_GemmM; // 2^n[16..128]
 
     bool use_spare_set;
 
@@ -711,10 +711,10 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops
         f(self.GemmKPerBlock, "GemmKPerBlock");
         f(self.GemmMPerWave, "GemmMPerWave");
         f(self.GemmNPerWave, "GemmNPerWave");
-        f(self.InBlockCopyClusterLengths_GemmK, "InBlockCopyClusterLengths_GemmK");
-        f(self.InBlockCopyClusterLengths_GemmN, "InBlockCopyClusterLengths_GemmN");
-        f(self.WeiBlockCopyClusterLengths_GemmK, "WeiBlockCopyClusterLengths_GemmK");
-        f(self.WeiBlockCopyClusterLengths_GemmM, "WeiBlockCopyClusterLengths_GemmM");
+        f(self.GemmBBlockCopyClusterLengths_GemmK, "GemmBBlockCopyClusterLengths_GemmK");
+        f(self.GemmBBlockCopyClusterLengths_GemmN, "GemmBBlockCopyClusterLengths_GemmN");
+        f(self.GemmABlockCopyClusterLengths_GemmK, "GemmABlockCopyClusterLengths_GemmK");
+        f(self.GemmABlockCopyClusterLengths_GemmM, "GemmABlockCopyClusterLengths_GemmM");
     }
 
     std::tuple<int, bool> CalculateGridSize(const ConvolutionContext& ctx) const;
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
index 1ee8e717cc..03e0041a91 100755
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -79,7 +79,7 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmABlockCopyPerformancePara
             SrcDataPerRead_GemmM = 1;
 
         // calculate threadwise copy size
-        const auto a_data_per_thread_copy = GemmMPerBlock / WeiBlockCopyClusterLengths_GemmM;
+        const auto a_data_per_thread_copy = GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM;
 
         if(!(a_data_per_thread_copy > 0))
             MIOPEN_THROW("invalid performance parameter");
@@ -127,7 +127,7 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformancePara
         }
 
         // calculate threadwise copy size
-        int b_data_per_thread_copy = GemmNPerBlock / InBlockCopyClusterLengths_GemmN;
+        int b_data_per_thread_copy = GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN;
 
         if(!(b_data_per_thread_copy > 0))
             MIOPEN_THROW("invalid performance parameter");
@@ -166,11 +166,6 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext&
             return false; // wrong! cannot divice N evenly among thread
     }
 
-    const auto& GemmBBlockCopyClusterLengths_GemmK = InBlockCopyClusterLengths_GemmK;
-    const auto& GemmBBlockCopyClusterLengths_GemmN = InBlockCopyClusterLengths_GemmN;
-    const auto& GemmABlockCopyClusterLengths_GemmK = WeiBlockCopyClusterLengths_GemmK;
-    const auto& GemmABlockCopyClusterLengths_GemmM = WeiBlockCopyClusterLengths_GemmM;
-
     if(!(GemmKPerBlock % GemmBBlockCopyClusterLengths_GemmK == 0 &&
          GemmKPerBlock % GemmABlockCopyClusterLengths_GemmK == 0 &&
          GemmNPerBlock % GemmBBlockCopyClusterLengths_GemmN == 0 &&
@@ -229,35 +224,35 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlo
     GemmMPerWave = spare ? 4 : 64;
     GemmNPerWave = spare ? 16 : 64;
 
-    InBlockCopyClusterLengths_GemmK = 4;
-    InBlockCopyClusterLengths_GemmN = 4;
+    GemmBBlockCopyClusterLengths_GemmK = 4;
+    GemmBBlockCopyClusterLengths_GemmN = 4;
 
-    WeiBlockCopyClusterLengths_GemmK = 2;
-    WeiBlockCopyClusterLengths_GemmM = 4;
+    GemmABlockCopyClusterLengths_GemmK = 2;
+    GemmABlockCopyClusterLengths_GemmM = 4;
 
     use_spare_set = spare;
 }
 
 PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlops(
-    int BPerBlock_,
-    int KPerBlock_,
-    int EPerBlock_,
+    int GemmNPerBlock_,
+    int GemmMPerBlock_,
+    int GemmKPerBlock_,
     int GemmMPerWave_,
     int GemmNPerWave_,
-    int InBlockCopyClusterLengths_E_,
-    int InBlockCopyClusterLengths_B_,
-    int WeiBlockCopyClusterLengths_E_,
-    int WeiBlockCopyClusterLengths_K_,
+    int GemmBBlockCopyClusterLengths_GemmK_,
+    int GemmBBlockCopyClusterLengths_GemmN_,
+    int GemmABlockCopyClusterLengths_GemmK_,
+    int GemmABlockCopyClusterLengths_GemmM_,
     bool use_spare_set_)
-    : GemmNPerBlock(BPerBlock_),
-      GemmMPerBlock(KPerBlock_),
-      GemmKPerBlock(EPerBlock_),
+    : GemmNPerBlock(GemmNPerBlock_),
+      GemmMPerBlock(GemmMPerBlock_),
+      GemmKPerBlock(GemmKPerBlock_),
       GemmMPerWave(GemmMPerWave_),
       GemmNPerWave(GemmNPerWave_),
-      InBlockCopyClusterLengths_GemmK(InBlockCopyClusterLengths_E_),
-      InBlockCopyClusterLengths_GemmN(InBlockCopyClusterLengths_B_),
-      WeiBlockCopyClusterLengths_GemmK(WeiBlockCopyClusterLengths_E_),
-      WeiBlockCopyClusterLengths_GemmM(WeiBlockCopyClusterLengths_K_),
+      GemmBBlockCopyClusterLengths_GemmK(GemmBBlockCopyClusterLengths_GemmK_),
+      GemmBBlockCopyClusterLengths_GemmN(GemmBBlockCopyClusterLengths_GemmN_),
+      GemmABlockCopyClusterLengths_GemmK(GemmABlockCopyClusterLengths_GemmK_),
+      GemmABlockCopyClusterLengths_GemmM(GemmABlockCopyClusterLengths_GemmM_),
       use_spare_set(use_spare_set_)
 {
 }
@@ -271,10 +266,10 @@ operator==(const PerformanceImplicitGemmBwdDataV4R1Xdlops& other) const
         && GemmKPerBlock == other.GemmKPerBlock
         && GemmMPerWave == other.GemmMPerWave
         && GemmNPerWave == other.GemmNPerWave
-        && InBlockCopyClusterLengths_GemmK == other.InBlockCopyClusterLengths_GemmK
-        && InBlockCopyClusterLengths_GemmN == other.InBlockCopyClusterLengths_GemmN
-        && WeiBlockCopyClusterLengths_GemmK == other.WeiBlockCopyClusterLengths_GemmK
-        && WeiBlockCopyClusterLengths_GemmM == other.WeiBlockCopyClusterLengths_GemmM
+        && GemmBBlockCopyClusterLengths_GemmK == other.GemmBBlockCopyClusterLengths_GemmK
+        && GemmBBlockCopyClusterLengths_GemmN == other.GemmBBlockCopyClusterLengths_GemmN
+        && GemmABlockCopyClusterLengths_GemmK == other.GemmABlockCopyClusterLengths_GemmK
+        && GemmABlockCopyClusterLengths_GemmM == other.GemmABlockCopyClusterLengths_GemmM
         && use_spare_set == other.use_spare_set;
     // clang-format on
 }
@@ -287,10 +282,10 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValidValue() const
         && IsTwoPower<4,32>(GemmKPerBlock)
         && IsTwoPower<4,64>(GemmMPerWave)
         && IsTwoPower<16,64>(GemmNPerWave)
-        && IsTwoPower<4,16>(InBlockCopyClusterLengths_GemmK)
-        && IsTwoPower<4,64>(InBlockCopyClusterLengths_GemmN)
-        && IsTwoPower<2,16>(WeiBlockCopyClusterLengths_GemmK)
-        && IsTwoPower<4,128>(WeiBlockCopyClusterLengths_GemmM); // clang-format on
+        && IsTwoPower<4,16>(GemmBBlockCopyClusterLengths_GemmK)
+        && IsTwoPower<4,64>(GemmBBlockCopyClusterLengths_GemmN)
+        && IsTwoPower<2,16>(GemmABlockCopyClusterLengths_GemmK)
+        && IsTwoPower<4,128>(GemmABlockCopyClusterLengths_GemmM); // clang-format on
 }
 
 bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue()
@@ -319,13 +314,13 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue()
             if(!NextTwoPower<16, 64>(GemmNPerWave))
                 break;
         }
-        if(!NextTwoPower<4, 16>(InBlockCopyClusterLengths_GemmK))
+        if(!NextTwoPower<4, 16>(GemmBBlockCopyClusterLengths_GemmK))
             break;
-        if(!NextTwoPower<4, 64>(InBlockCopyClusterLengths_GemmN))
+        if(!NextTwoPower<4, 64>(GemmBBlockCopyClusterLengths_GemmN))
             break;
-        if(!NextTwoPower<2, 16>(WeiBlockCopyClusterLengths_GemmK))
+        if(!NextTwoPower<2, 16>(GemmABlockCopyClusterLengths_GemmK))
             break;
-        if(!NextTwoPower<4, 128>(WeiBlockCopyClusterLengths_GemmM))
+        if(!NextTwoPower<4, 128>(GemmABlockCopyClusterLengths_GemmM))
             break;
         return false;
     } while(false);
@@ -653,12 +648,12 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution(
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_K_PER_BLOCK=") + std::to_string(GemmKPerBlock) +
                 std::string(" -DCK_PARAM_GEMM_M_PER_WAVE=") + std::to_string(GemmMPerWave) +
                 std::string(" -DCK_PARAM_GEMM_N_PER_WAVE=") + std::to_string(GemmNPerWave) +
-                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.WeiBlockCopyClusterLengths_GemmK) +
-                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M=") + std::to_string(config.WeiBlockCopyClusterLengths_GemmM) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.GemmABlockCopyClusterLengths_GemmK) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M=") + std::to_string(config.GemmABlockCopyClusterLengths_GemmM) +
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_M=") + std::to_string(GemmABlockCopySrcDataPerRead_GemmM     ) +
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_M=") + std::to_string(GemmABlockCopyDstDataPerWrite_GemmM) +
-                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.InBlockCopyClusterLengths_GemmK) +
-                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N=") + std::to_string(config.InBlockCopyClusterLengths_GemmN) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.GemmBBlockCopyClusterLengths_GemmK) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N=") + std::to_string(config.GemmBBlockCopyClusterLengths_GemmN) +
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N=") + std::to_string(GemmBBlockCopySrcDataPerRead_GemmN     ) +
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N=") + std::to_string(GemmBBlockCopyDstDataPerWrite_GemmN) +
                 std::string(" -DCK_PARAM_DEPENDENT_GRID_SIZE=") + std::to_string(grid_size) +

From 39f7cc01f2248c56fcb7faab8e6093fe73913d6c Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Thu, 7 May 2020 14:58:45 +0800
Subject: [PATCH 14/18] remove threadwise gemm's inliine macro

---
 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
index 03e0041a91..114239fbdb 100755
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -657,7 +657,6 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution(
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N=") + std::to_string(GemmBBlockCopySrcDataPerRead_GemmN     ) +
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N=") + std::to_string(GemmBBlockCopyDstDataPerWrite_GemmN) +
                 std::string(" -DCK_PARAM_DEPENDENT_GRID_SIZE=") + std::to_string(grid_size) +
-                std::string(" -DCK_THREADWISE_GEMM_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') +
                 std::string(" -DCK_USE_AMD_BUFFER_ATOMIC_ADD=") + (support_amd_buffer_atomic_add(ctx) ? '1' : '0') +
                 std::string(" -DCK_USE_AMD_XDLOPS=") + std::to_string(IsXdlopsSupport(ctx) ? 1 : 0) +
                 std::string(" -DCK_USE_AMD_XDLOPS_INLINE_ASM=") + std::to_string(miopen::IsEnabled(MIOPEN_DEBUG_IMPLICIT_GEMM_XDLOPS_INLINE_ASM{}) ? 1 : 0) +

From 4a1e51149be5e9cfc90baa732a2c5179f58f4cab Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Fri, 8 May 2020 16:05:18 +0800
Subject: [PATCH 15/18] remove cluster lengths from tunable params

---
 src/include/miopen/solver.hpp                 |  23 +-
 ...hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 263 +++++++++++-------
 2 files changed, 179 insertions(+), 107 deletions(-)
 mode change 100755 => 100644 src/include/miopen/solver.hpp
 mode change 100755 => 100644 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp

diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp
old mode 100755
new mode 100644
index be62dffe12..650768f0b7
--- a/src/include/miopen/solver.hpp
+++ b/src/include/miopen/solver.hpp
@@ -677,25 +677,24 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops
 
     int GemmMPerWave;
     int GemmNPerWave;
-
+#if 0
     int GemmBBlockCopyClusterLengths_GemmK; // 2^n[4..16]
     int GemmBBlockCopyClusterLengths_GemmN; // 2^n[8..64]
 
     int GemmABlockCopyClusterLengths_GemmK; // 2^n[1..4]
     int GemmABlockCopyClusterLengths_GemmM; // 2^n[16..128]
-
+#endif
     bool use_spare_set;
 
-    PerformanceImplicitGemmBwdDataV4R1Xdlops(int, int, int, int, int, int, int, int, int, bool);
+    PerformanceImplicitGemmBwdDataV4R1Xdlops(int, int, int, int, int, bool);
 
     PerformanceImplicitGemmBwdDataV4R1Xdlops()
-        : PerformanceImplicitGemmBwdDataV4R1Xdlops(-1, -1, -1, -1, -1, -1, -1, -1, -1, false)
+        : PerformanceImplicitGemmBwdDataV4R1Xdlops(-1, -1, -1, -1, -1, false)
     {
     }
 
-    PerformanceImplicitGemmBwdDataV4R1Xdlops(
-        int a, int b, int c, int d, int e, int f, int g, int h, int i)
-        : PerformanceImplicitGemmBwdDataV4R1Xdlops(a, b, c, d, e, f, g, h, i, false)
+    PerformanceImplicitGemmBwdDataV4R1Xdlops(int a, int b, int c, int d, int e)
+        : PerformanceImplicitGemmBwdDataV4R1Xdlops(a, b, c, d, e, false)
     {
     }
 
@@ -711,20 +710,20 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops
         f(self.GemmKPerBlock, "GemmKPerBlock");
         f(self.GemmMPerWave, "GemmMPerWave");
         f(self.GemmNPerWave, "GemmNPerWave");
+#if 0
         f(self.GemmBBlockCopyClusterLengths_GemmK, "GemmBBlockCopyClusterLengths_GemmK");
         f(self.GemmBBlockCopyClusterLengths_GemmN, "GemmBBlockCopyClusterLengths_GemmN");
         f(self.GemmABlockCopyClusterLengths_GemmK, "GemmABlockCopyClusterLengths_GemmK");
         f(self.GemmABlockCopyClusterLengths_GemmM, "GemmABlockCopyClusterLengths_GemmM");
+#endif
     }
 
     std::tuple<int, bool> CalculateGridSize(const ConvolutionContext& ctx) const;
-    std::tuple<int, int, bool>
-    CalculateBlockGemmPerformanceParameters(const ConvolutionContext& ctx) const;
-    std::tuple<int, int, bool>
+    std::tuple<std::size_t, bool> CalculateLdsNumberOfByte(const ConvolutionContext& ctx) const;
+    std::tuple<int, int, int, int, bool>
     CalculateGemmABlockCopyPerformanceParameters(const ConvolutionContext& ctx) const;
-    std::tuple<int, int, bool>
+    std::tuple<int, int, int, int, bool>
     CalculateGemmBBlockCopyPerformanceParameters(const ConvolutionContext& ctx) const;
-    // std::tuple<std::size_t, bool> CalculateLdsNumberOfByte(const ConvolutionContext& ctx) const;
     bool IsValidValue() const;
     bool IsValid(const ConvolutionContext& ctx) const;
     void EuristicInit(const ConvolutionContext& ctx);
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
old mode 100755
new mode 100644
index 114239fbdb..0273e418b5
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -59,15 +59,21 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGridSize(const ConvolutionCon
     return std::make_tuple(GridSize, true);
 }
 
-std::tuple<int, int, bool>
+std::tuple<int, int, int, int, bool>
 PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmABlockCopyPerformanceParameters(
     const ConvolutionContext& ctx) const
 {
+    int ClusterLengths_GemmK  = 0;
+    int ClusterLengths_GemmM  = 0;
     int SrcDataPerRead_GemmM  = amd_buffer_load_max_length<float>();
     int DstDataPerWrite_GemmM = amd_lds_write_max_length<float>();
 
     try
     {
+        const auto WaveSize = 64;
+        const auto BlockSize =
+            GemmNPerBlock * GemmMPerBlock / (GemmMPerWave * GemmNPerWave) * WaveSize;
+
         // calculate vector length on gemmk dimension
         SrcDataPerRead_GemmM = gcd(SrcDataPerRead_GemmM, GemmMPerBlock);
 
@@ -79,7 +85,7 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmABlockCopyPerformancePara
             SrcDataPerRead_GemmM = 1;
 
         // calculate threadwise copy size
-        const auto a_data_per_thread_copy = GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM;
+        const auto a_data_per_thread_copy = (GemmKPerBlock * GemmMPerBlock) / BlockSize;
 
         if(!(a_data_per_thread_copy > 0))
             MIOPEN_THROW("invalid performance parameter");
@@ -87,26 +93,48 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmABlockCopyPerformancePara
         // GemmABlockCopySrcDataPerRead_GemmK also bounded by size of threadwise copy
         SrcDataPerRead_GemmM = gcd(SrcDataPerRead_GemmM, a_data_per_thread_copy);
 
+        // decide threadwise copy lengths
+        const auto a_data_per_thread_copy_gemmm = SrcDataPerRead_GemmM;
+        const auto a_data_per_thread_copy_gemmk =
+            a_data_per_thread_copy / a_data_per_thread_copy_gemmm;
+
         // GemmABlockCopyDstDataPerWrite_GemmM also bounded by size of threadwise copy
-        DstDataPerWrite_GemmM = gcd(DstDataPerWrite_GemmM, SrcDataPerRead_GemmM);
+        DstDataPerWrite_GemmM = gcd(DstDataPerWrite_GemmM, a_data_per_thread_copy_gemmm);
+
+        // calculate blockwise copy thread cluster lengths
+        ClusterLengths_GemmK = GemmKPerBlock / a_data_per_thread_copy_gemmk;
+        ClusterLengths_GemmM = GemmMPerBlock / a_data_per_thread_copy_gemmm;
+
+        if(!(ClusterLengths_GemmK > 0 && ClusterLengths_GemmM > 0))
+            MIOPEN_THROW("invalid performance parameter");
     }
     catch(...)
     {
-        return std::make_tuple(-1, -1, false);
+        return std::make_tuple(-1, -1, -1, -1, false);
     }
 
-    return std::make_tuple(SrcDataPerRead_GemmM, DstDataPerWrite_GemmM, true);
+    return std::make_tuple(ClusterLengths_GemmK,
+                           ClusterLengths_GemmM,
+                           SrcDataPerRead_GemmM,
+                           DstDataPerWrite_GemmM,
+                           true);
 }
 
-std::tuple<int, int, bool>
+std::tuple<int, int, int, int, bool>
 PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformanceParameters(
     const ConvolutionContext& ctx) const
 {
+    int ClusterLengths_GemmK  = 0;
+    int ClusterLengths_GemmN  = 0;
     int SrcDataPerRead_GemmN  = amd_buffer_load_max_length<float>();
     int DstDataPerWrite_GemmN = amd_lds_write_max_length<float>();
 
     try
     {
+        const auto WaveSize = 64;
+        const auto BlockSize =
+            GemmNPerBlock * GemmMPerBlock / (GemmMPerWave * GemmNPerWave) * WaveSize;
+
         SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, GemmNPerBlock);
 
         // calculate vector length on gemmn dimension
@@ -116,9 +144,8 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformancePara
         // \todo too conversative
         if(y == 1 && x == 1)
         {
-            const auto ho = ConvolutionContextInterpreter::GetOutputHeightHo(ctx);
-            const auto wo = ConvolutionContextInterpreter::GetOutputWidthWo(ctx);
-
+            const auto ho        = ConvolutionContextInterpreter::GetOutputHeightHo(ctx);
+            const auto wo        = ConvolutionContextInterpreter::GetOutputWidthWo(ctx);
             SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, ho * wo);
         }
         else
@@ -127,7 +154,7 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformancePara
         }
 
         // calculate threadwise copy size
-        int b_data_per_thread_copy = GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN;
+        int b_data_per_thread_copy = (GemmKPerBlock * GemmNPerBlock) / BlockSize;
 
         if(!(b_data_per_thread_copy > 0))
             MIOPEN_THROW("invalid performance parameter");
@@ -135,16 +162,85 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformancePara
         // GemmBBlockCopySrcDataPerRead_GemmN also bounded by size of threadwise copy
         SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, b_data_per_thread_copy);
 
+        const auto b_data_per_thread_copy_gemmn = SrcDataPerRead_GemmN;
+        const auto b_data_per_thread_copy_gemmk =
+            b_data_per_thread_copy / b_data_per_thread_copy_gemmn;
+
         // GemmBBlockCopyDstDataPerWrite_GemmN also bounded by size of threadwise copy
-        DstDataPerWrite_GemmN = gcd(DstDataPerWrite_GemmN, SrcDataPerRead_GemmN);
+        DstDataPerWrite_GemmN = gcd(DstDataPerWrite_GemmN, b_data_per_thread_copy_gemmn);
+
+        // calculate blockwise copy thread cluster lengths
+        ClusterLengths_GemmK = GemmKPerBlock / b_data_per_thread_copy_gemmk;
+        ClusterLengths_GemmN = GemmNPerBlock / b_data_per_thread_copy_gemmn;
+
+        if(!(ClusterLengths_GemmK > 0 && ClusterLengths_GemmN > 0))
+            MIOPEN_THROW("invalid performance parameter");
     }
     catch(...)
     {
         MIOPEN_LOG_I("catch");
-        return std::make_tuple(-1, -1, false);
+        return std::make_tuple(-1, -1, -1, -1, false);
     }
 
-    return std::make_tuple(SrcDataPerRead_GemmN, DstDataPerWrite_GemmN, true);
+    return std::make_tuple(ClusterLengths_GemmK,
+                           ClusterLengths_GemmN,
+                           SrcDataPerRead_GemmN,
+                           DstDataPerWrite_GemmN,
+                           true);
+}
+
+std::tuple<std::size_t, bool> PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateLdsNumberOfByte(
+    const ConvolutionContext& ctx) const
+{
+    std::size_t lds_size = 0;
+
+    try
+    {
+        bool valid = false;
+
+        int GemmABlockCopyClusterLengths_GemmM  = 0;
+        int GemmABlockCopyDescDataPerWriteGemmM = 0;
+        std::tie(std::ignore,
+                 GemmABlockCopyClusterLengths_GemmM,
+                 std::ignore,
+                 GemmABlockCopyDescDataPerWriteGemmM,
+                 valid) = CalculateGemmABlockCopyPerformanceParameters(ctx);
+
+        if(!valid)
+            MIOPEN_THROW("invalid performance parameter");
+
+        int GemmBBlockCopyClusterLengths_GemmN  = 0;
+        int GemmBBlockCopyDescDataPerWriteGemmN = 0;
+        std::tie(std::ignore,
+                 GemmBBlockCopyClusterLengths_GemmN,
+                 std::ignore,
+                 GemmBBlockCopyDescDataPerWriteGemmN,
+                 valid) = CalculateGemmBBlockCopyPerformanceParameters(ctx);
+
+        if(!valid)
+            MIOPEN_THROW("invalid performance parameter");
+
+        const auto ThreadGemmDataPerRead_GemmM = GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM;
+        const auto ThreadGemmDataPerRead_GemmN = GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN;
+
+        const auto max_lds_align = lcm(GemmABlockCopyDescDataPerWriteGemmM,
+                                       GemmBBlockCopyDescDataPerWriteGemmN,
+                                       ThreadGemmDataPerRead_GemmM,
+                                       ThreadGemmDataPerRead_GemmN);
+
+        const auto a_block_space =
+            GemmKPerBlock * integer_least_multiple(GemmMPerBlock, max_lds_align);
+        const auto b_block_space =
+            GemmKPerBlock * integer_least_multiple(GemmNPerBlock, max_lds_align);
+
+        lds_size = 2 * (a_block_space + b_block_space) * sizeof(float);
+    }
+    catch(...)
+    {
+        return std::make_tuple(0, false);
+    }
+
+    return std::make_tuple(lds_size, true);
 }
 
 bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext& ctx) const
@@ -165,54 +261,45 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext&
              GemmK % (GemmKPerBlock * GemmKBlocks) == 0))
             return false; // wrong! cannot divice N evenly among thread
     }
+    // heuristic to reduce search space
+    {
+        // use largest XdlopsGemm
+        if(GemmMPerBlock >= 64 && GemmMPerWave != 64)
+            return false;
+        if(GemmNPerBlock >= 64 && GemmNPerWave != 64)
+            return false;
+        if((GemmMPerBlock == 32 || GemmMPerBlock == 16) && GemmMPerWave != GemmMPerBlock)
+            return false;
+        if((GemmNPerBlock == 32 || GemmNPerBlock == 16) && GemmNPerWave != GemmNPerBlock)
+            return false;
+    }
 
-    if(!(GemmKPerBlock % GemmBBlockCopyClusterLengths_GemmK == 0 &&
-         GemmKPerBlock % GemmABlockCopyClusterLengths_GemmK == 0 &&
-         GemmNPerBlock % GemmBBlockCopyClusterLengths_GemmN == 0 &&
-         GemmMPerBlock % GemmABlockCopyClusterLengths_GemmM == 0))
-        return false;
+    if(!(GemmM % GemmMPerBlock == 0 && GemmN % GemmNPerBlock == 0 && GemmK % GemmKPerBlock == 0))
+        return false; // wrong! cannot divice N evenly among thread
 
-    // unsupported xdlops-gemm
-    if(GemmMPerWave == 16 && GemmNPerWave == 32)
-        return false;
-    if(GemmMPerWave == 32 && GemmNPerWave == 16)
-        return false;
-    if(GemmMPerWave == 8 && GemmNPerWave != 64)
-        return false;
-    if(GemmMPerWave == 4 && GemmNPerWave != 64)
+    if(!IsValidXdlopsGemm(GemmMPerBlock, GemmNPerBlock, GemmKPerBlock, GemmMPerWave, GemmNPerWave))
         return false;
 
-    const auto WaveSize  = 64;
-    const auto BlockSize = GemmNPerBlock * GemmMPerBlock / (GemmMPerWave * GemmNPerWave) * WaveSize;
+    bool valid = false;
 
-    // fail with blockSize >= 512
-    /// \todo fix the issue with blockSize >= 512
-    if(BlockSize < 64 || BlockSize > 256)
-        return false;
+    // check blockwise copy of A matrix
+    std::tie(std::ignore, std::ignore, std::ignore, std::ignore, valid) =
+        CalculateGemmABlockCopyPerformanceParameters(ctx);
 
-    if(BlockSize != GemmBBlockCopyClusterLengths_GemmK * GemmBBlockCopyClusterLengths_GemmN)
+    if(!valid)
         return false;
 
-    if(BlockSize != GemmABlockCopyClusterLengths_GemmM * GemmABlockCopyClusterLengths_GemmK)
-        return false;
+    // check blockwise copy of B matrix
+    std::tie(std::ignore, std::ignore, std::ignore, std::ignore, valid) =
+        CalculateGemmBBlockCopyPerformanceParameters(ctx);
 
-    if((GemmMPerBlock % GemmMPerWave) != 0 || (GemmNPerBlock % GemmNPerWave) != 0)
+    if(!valid)
         return false;
 
-    const auto GemmBBlockCopyThreadSliceLengths_GemmN =
-        GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN;
-    const auto GemmABlockCopyThreadSliceLengths_GemmM =
-        GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM;
-    const auto lds_size = ComputeLDSRequiredSize(ctx,
-                                                 GemmNPerBlock,
-                                                 GemmMPerBlock,
-                                                 GemmKPerBlock,
-                                                 1,
-                                                 1,
-                                                 GemmBBlockCopyThreadSliceLengths_GemmN,
-                                                 GemmABlockCopyThreadSliceLengths_GemmM,
-                                                 GetEPackLength(ctx, true));
-    return lds_size <= 64 * 1024;
+    std::size_t lds_size = 0;
+    std::tie(lds_size, valid) = CalculateLdsNumberOfByte(ctx);
+
+    return (valid and lds_size <= 64 * 1024);
 }
 
 PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlops(bool spare)
@@ -224,12 +311,6 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlo
     GemmMPerWave = spare ? 4 : 64;
     GemmNPerWave = spare ? 16 : 64;
 
-    GemmBBlockCopyClusterLengths_GemmK = 4;
-    GemmBBlockCopyClusterLengths_GemmN = 4;
-
-    GemmABlockCopyClusterLengths_GemmK = 2;
-    GemmABlockCopyClusterLengths_GemmM = 4;
-
     use_spare_set = spare;
 }
 
@@ -239,20 +320,12 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlo
     int GemmKPerBlock_,
     int GemmMPerWave_,
     int GemmNPerWave_,
-    int GemmBBlockCopyClusterLengths_GemmK_,
-    int GemmBBlockCopyClusterLengths_GemmN_,
-    int GemmABlockCopyClusterLengths_GemmK_,
-    int GemmABlockCopyClusterLengths_GemmM_,
     bool use_spare_set_)
     : GemmNPerBlock(GemmNPerBlock_),
       GemmMPerBlock(GemmMPerBlock_),
       GemmKPerBlock(GemmKPerBlock_),
       GemmMPerWave(GemmMPerWave_),
       GemmNPerWave(GemmNPerWave_),
-      GemmBBlockCopyClusterLengths_GemmK(GemmBBlockCopyClusterLengths_GemmK_),
-      GemmBBlockCopyClusterLengths_GemmN(GemmBBlockCopyClusterLengths_GemmN_),
-      GemmABlockCopyClusterLengths_GemmK(GemmABlockCopyClusterLengths_GemmK_),
-      GemmABlockCopyClusterLengths_GemmM(GemmABlockCopyClusterLengths_GemmM_),
       use_spare_set(use_spare_set_)
 {
 }
@@ -266,10 +339,6 @@ operator==(const PerformanceImplicitGemmBwdDataV4R1Xdlops& other) const
         && GemmKPerBlock == other.GemmKPerBlock
         && GemmMPerWave == other.GemmMPerWave
         && GemmNPerWave == other.GemmNPerWave
-        && GemmBBlockCopyClusterLengths_GemmK == other.GemmBBlockCopyClusterLengths_GemmK
-        && GemmBBlockCopyClusterLengths_GemmN == other.GemmBBlockCopyClusterLengths_GemmN
-        && GemmABlockCopyClusterLengths_GemmK == other.GemmABlockCopyClusterLengths_GemmK
-        && GemmABlockCopyClusterLengths_GemmM == other.GemmABlockCopyClusterLengths_GemmM
         && use_spare_set == other.use_spare_set;
     // clang-format on
 }
@@ -281,11 +350,7 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValidValue() const
         && IsTwoPower<4,128>(GemmMPerBlock)
         && IsTwoPower<4,32>(GemmKPerBlock)
         && IsTwoPower<4,64>(GemmMPerWave)
-        && IsTwoPower<16,64>(GemmNPerWave)
-        && IsTwoPower<4,16>(GemmBBlockCopyClusterLengths_GemmK)
-        && IsTwoPower<4,64>(GemmBBlockCopyClusterLengths_GemmN)
-        && IsTwoPower<2,16>(GemmABlockCopyClusterLengths_GemmK)
-        && IsTwoPower<4,128>(GemmABlockCopyClusterLengths_GemmM); // clang-format on
+        && IsTwoPower<16,64>(GemmNPerWave); // clang-format on
 }
 
 bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue()
@@ -314,6 +379,7 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue()
             if(!NextTwoPower<16, 64>(GemmNPerWave))
                 break;
         }
+#if 0
         if(!NextTwoPower<4, 16>(GemmBBlockCopyClusterLengths_GemmK))
             break;
         if(!NextTwoPower<4, 64>(GemmBBlockCopyClusterLengths_GemmN))
@@ -322,6 +388,7 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue()
             break;
         if(!NextTwoPower<4, 128>(GemmABlockCopyClusterLengths_GemmM))
             break;
+#endif
         return false;
     } while(false);
 
@@ -331,25 +398,23 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue()
 void PerformanceImplicitGemmBwdDataV4R1Xdlops::EuristicInit(const ConvolutionContext& ctx)
 {
     PerformanceImplicitGemmBwdDataV4R1Xdlops tmp;
-    tmp = {128, 128, 8, 64, 64, 4, 64, 4, 64, use_spare_set};
+    tmp = {128, 128, 8, 64, 64, use_spare_set};
     if(!tmp.IsValid(ctx))
-        tmp = {64, 32, 4, 32, 64, 4, 16, 2, 32, use_spare_set};
+        tmp = {64, 32, 4, 32, 64, use_spare_set};
     if(!tmp.IsValid(ctx))
-        tmp = {64, 32, 4, 32, 64, 4, 16, 4, 16, use_spare_set};
+        tmp = {32, 64, 4, 64, 32, use_spare_set};
     if(!tmp.IsValid(ctx))
-        tmp = {32, 64, 4, 64, 32, 4, 16, 4, 16, use_spare_set};
+        tmp = {32, 32, 4, 32, 32, use_spare_set};
     if(!tmp.IsValid(ctx))
-        tmp = {32, 32, 4, 32, 32, 4, 16, 2, 32, use_spare_set};
+        tmp = {64, 16, 4, 16, 64, use_spare_set};
     if(!tmp.IsValid(ctx))
-        tmp = {64, 16, 4, 16, 64, 4, 16, 4, 16, use_spare_set};
+        tmp = {16, 64, 4, 64, 16, use_spare_set};
     if(!tmp.IsValid(ctx))
-        tmp = {16, 64, 4, 64, 16, 4, 16, 4, 16, use_spare_set};
+        tmp = {16, 16, 4, 16, 16, use_spare_set};
     if(!tmp.IsValid(ctx))
-        tmp = {16, 16, 4, 16, 16, 4, 16, 4, 16, use_spare_set};
+        tmp = {64, 4, 16, 4, 64, use_spare_set};
     if(!tmp.IsValid(ctx))
-        tmp = {64, 4, 16, 4, 64, 16, 4, 16, 4, use_spare_set};
-    if(!tmp.IsValid(ctx))
-        tmp = {64, 8, 8, 8, 64, 4, 16, 8, 8, use_spare_set};
+        tmp = {64, 8, 8, 8, 64, use_spare_set};
     if(!tmp.IsValid(ctx))
     {
         MIOPEN_LOG_E("All attempts failed");
@@ -608,16 +673,24 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution(
             // TODO: add fp16 calculation by GetWorkspaceSize(ctx);
             result.workspce_sz = 0;
 
-            std::size_t GemmABlockCopySrcDataPerRead_GemmM  = 1;
-            std::size_t GemmABlockCopyDstDataPerWrite_GemmM = 1;
-            std::size_t GemmBBlockCopySrcDataPerRead_GemmN  = 1;
-            std::size_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
-
-            std::tie(GemmABlockCopySrcDataPerRead_GemmM,
+            int GemmABlockCopySrcDataPerRead_GemmM  = 1;
+            int GemmABlockCopyDstDataPerWrite_GemmM = 1;
+            int GemmBBlockCopySrcDataPerRead_GemmN  = 1;
+            int GemmBBlockCopyDstDataPerWrite_GemmN = 1;
+            int GemmABlockCopyClusterLengths_GemmK  = 0;
+            int GemmABlockCopyClusterLengths_GemmM  = 0;
+            int GemmBBlockCopyClusterLengths_GemmK  = 0;
+            int GemmBBlockCopyClusterLengths_GemmN  = 0;
+
+            std::tie(GemmABlockCopyClusterLengths_GemmK,
+                     GemmABlockCopyClusterLengths_GemmM,
+                     GemmABlockCopySrcDataPerRead_GemmM,
                      GemmABlockCopyDstDataPerWrite_GemmM,
                      std::ignore) = config.CalculateGemmABlockCopyPerformanceParameters(ctx);
 
-            std::tie(GemmBBlockCopySrcDataPerRead_GemmN,
+            std::tie(GemmBBlockCopyClusterLengths_GemmK,
+                     GemmBBlockCopyClusterLengths_GemmN,
+                     GemmBBlockCopySrcDataPerRead_GemmN,
                      GemmBBlockCopyDstDataPerWrite_GemmN,
                      std::ignore) = config.CalculateGemmBBlockCopyPerformanceParameters(ctx);
 
@@ -648,12 +721,12 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution(
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_K_PER_BLOCK=") + std::to_string(GemmKPerBlock) +
                 std::string(" -DCK_PARAM_GEMM_M_PER_WAVE=") + std::to_string(GemmMPerWave) +
                 std::string(" -DCK_PARAM_GEMM_N_PER_WAVE=") + std::to_string(GemmNPerWave) +
-                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.GemmABlockCopyClusterLengths_GemmK) +
-                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M=") + std::to_string(config.GemmABlockCopyClusterLengths_GemmM) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(GemmABlockCopyClusterLengths_GemmK) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M=") + std::to_string(GemmABlockCopyClusterLengths_GemmM) +
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_M=") + std::to_string(GemmABlockCopySrcDataPerRead_GemmM     ) +
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_M=") + std::to_string(GemmABlockCopyDstDataPerWrite_GemmM) +
-                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.GemmBBlockCopyClusterLengths_GemmK) +
-                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N=") + std::to_string(config.GemmBBlockCopyClusterLengths_GemmN) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(GemmBBlockCopyClusterLengths_GemmK) +
+                std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N=") + std::to_string(GemmBBlockCopyClusterLengths_GemmN) +
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N=") + std::to_string(GemmBBlockCopySrcDataPerRead_GemmN     ) +
                 std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N=") + std::to_string(GemmBBlockCopyDstDataPerWrite_GemmN) +
                 std::string(" -DCK_PARAM_DEPENDENT_GRID_SIZE=") + std::to_string(grid_size) +

From 515105047602cf8956e19f901a6d1ed5c456aa23 Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Fri, 8 May 2020 17:19:02 +0800
Subject: [PATCH 16/18] clang format for src/solver.cpp

---
 src/solver.cpp | 1 -
 1 file changed, 1 deletion(-)
 mode change 100755 => 100644 src/solver.cpp

diff --git a/src/solver.cpp b/src/solver.cpp
old mode 100755
new mode 100644
index 45f050f4d1..ca0cf35916
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -315,7 +315,6 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
 
     RegisterWithSolver(
         registry, ++id, ConvHipImplicitGemmBwdDataV4R1Xdlops{}, miopenConvolutionAlgoImplicitGEMM);
-
 }
 
 } // namespace solver

From d068e9affb68f22e3a58f0d3544115ce6dc8a566 Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Sat, 9 May 2020 10:19:18 +0800
Subject: [PATCH 17/18] remove non-used code

---
 src/include/miopen/solver.hpp                        | 12 ------------
 .../conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp  | 10 ----------
 2 files changed, 22 deletions(-)
 mode change 100644 => 100755 src/include/miopen/solver.hpp
 mode change 100644 => 100755 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp

diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp
old mode 100644
new mode 100755
index d17cfae274..40bae8c223
--- a/src/include/miopen/solver.hpp
+++ b/src/include/miopen/solver.hpp
@@ -677,13 +677,7 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops
 
     int GemmMPerWave;
     int GemmNPerWave;
-#if 0
-    int GemmBBlockCopyClusterLengths_GemmK; // 2^n[4..16]
-    int GemmBBlockCopyClusterLengths_GemmN; // 2^n[8..64]
 
-    int GemmABlockCopyClusterLengths_GemmK; // 2^n[1..4]
-    int GemmABlockCopyClusterLengths_GemmM; // 2^n[16..128]
-#endif
     bool use_spare_set;
 
     PerformanceImplicitGemmBwdDataV4R1Xdlops(int, int, int, int, int, bool);
@@ -710,12 +704,6 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops
         f(self.GemmKPerBlock, "GemmKPerBlock");
         f(self.GemmMPerWave, "GemmMPerWave");
         f(self.GemmNPerWave, "GemmNPerWave");
-#if 0
-        f(self.GemmBBlockCopyClusterLengths_GemmK, "GemmBBlockCopyClusterLengths_GemmK");
-        f(self.GemmBBlockCopyClusterLengths_GemmN, "GemmBBlockCopyClusterLengths_GemmN");
-        f(self.GemmABlockCopyClusterLengths_GemmK, "GemmABlockCopyClusterLengths_GemmK");
-        f(self.GemmABlockCopyClusterLengths_GemmM, "GemmABlockCopyClusterLengths_GemmM");
-#endif
     }
 
     std::tuple<int, bool> CalculateGridSize(const ConvolutionContext& ctx) const;
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
old mode 100644
new mode 100755
index 0273e418b5..439a634775
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -379,16 +379,6 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue()
             if(!NextTwoPower<16, 64>(GemmNPerWave))
                 break;
         }
-#if 0
-        if(!NextTwoPower<4, 16>(GemmBBlockCopyClusterLengths_GemmK))
-            break;
-        if(!NextTwoPower<4, 64>(GemmBBlockCopyClusterLengths_GemmN))
-            break;
-        if(!NextTwoPower<2, 16>(GemmABlockCopyClusterLengths_GemmK))
-            break;
-        if(!NextTwoPower<4, 128>(GemmABlockCopyClusterLengths_GemmM))
-            break;
-#endif
         return false;
     } while(false);
 

From abf7fd5b770865500512a05e6f39d2055d4e2c50 Mon Sep 17 00:00:00 2001
From: shaojiewang <wsjmessi@163.com>
Date: Wed, 13 May 2020 11:40:28 +0000
Subject: [PATCH 18/18] remove workaround issues for v4r1 xdlops kernel

---
 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
index 439a634775..08855f1780 100755
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
@@ -505,10 +505,6 @@ ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateGemmSize(const ConvolutionContext
 
 bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsApplicable(const ConvolutionContext& ctx) const
 {
-#if WORKAROUND_SWDEV_229277_227616_229195
-    if(!IsHccCompiler())
-        return false;
-#endif
     bool is_applicable = true;
 
     if(!ctx.direction.IsBackwardData())