From ba837fc7e4403bf8653bd164446609022f011366 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 20 Apr 2020 02:24:48 +0000 Subject: [PATCH 01/18] add igemm bwd v4r1 xdlops kernel --- src/CMakeLists.txt | 1 + src/include/miopen/solver.hpp | 87 +++ ...plicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp | 434 +++++++++++ ...plicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp | 143 ++++ src/mlo_dir_conv.cpp | 3 +- src/ocl/convolutionocl.cpp | 4 +- src/solver.cpp | 3 + ...hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 688 ++++++++++++++++++ 8 files changed, 1361 insertions(+), 2 deletions(-) mode change 100644 => 100755 src/CMakeLists.txt mode change 100644 => 100755 src/include/miopen/solver.hpp create mode 100755 src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp create mode 100755 src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp mode change 100644 => 100755 src/mlo_dir_conv.cpp mode change 100644 => 100755 src/ocl/convolutionocl.cpp mode change 100644 => 100755 src/solver.cpp create mode 100755 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt old mode 100644 new mode 100755 index c39ac496d6..853b403ae8 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -216,6 +216,7 @@ set( MIOpen_Source solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp + solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp ) list(APPEND MIOpen_Source tmp_dir.cpp binary_cache.cpp md5.cpp) diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp old mode 100644 new mode 100755 index bdddb8919b..fef33514ba --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -668,6 +668,70 @@ struct PerformanceImplicitGemmBwdDataV4R1 : Serializable +{ + int GemmNPerBlock; // 2^n[8..16] + int GemmMPerBlock; // 2^n[32..128] + int GemmKPerBlock; // 2^n[4..16] + + int GemmMPerWave; + int GemmNPerWave; + + int InBlockCopyClusterLengths_GemmK; // 2^n[4..16] + int InBlockCopyClusterLengths_GemmN; // 2^n[8..64] + + int WeiBlockCopyClusterLengths_GemmK; // 2^n[1..4] + int WeiBlockCopyClusterLengths_GemmM; // 2^n[16..128] + + bool use_spare_set; + + PerformanceImplicitGemmBwdDataV4R1Xdlops(int, int, int, int, int, int, int, int, int, bool); + + PerformanceImplicitGemmBwdDataV4R1Xdlops() + : PerformanceImplicitGemmBwdDataV4R1Xdlops(-1, -1, -1, -1, -1, -1, -1, -1, -1, false) + { + } + + PerformanceImplicitGemmBwdDataV4R1Xdlops( + int a, int b, int c, int d, int e, int f, int g, int h, int i) + : PerformanceImplicitGemmBwdDataV4R1Xdlops(a, b, c, d, e, f, g, h, i, false) + { + } + + PerformanceImplicitGemmBwdDataV4R1Xdlops(bool spare); + + bool operator==(const PerformanceImplicitGemmBwdDataV4R1Xdlops& other) const; + + template + static void Visit(Self&& self, F f) + { + f(self.GemmNPerBlock, "GemmNPerBlock"); + f(self.GemmMPerBlock, "GemmMPerBlock"); + f(self.GemmKPerBlock, "GemmKPerBlock"); + f(self.GemmMPerWave, "GemmMPerWave"); + f(self.GemmNPerWave, "GemmNPerWave"); + f(self.InBlockCopyClusterLengths_GemmK, "InBlockCopyClusterLengths_GemmK"); + f(self.InBlockCopyClusterLengths_GemmN, "InBlockCopyClusterLengths_GemmN"); + f(self.WeiBlockCopyClusterLengths_GemmK, "WeiBlockCopyClusterLengths_GemmK"); + f(self.WeiBlockCopyClusterLengths_GemmM, "WeiBlockCopyClusterLengths_GemmM"); + } + + std::tuple CalculateGridSize(const ConvolutionContext& ctx) const; + std::tuple + CalculateBlockGemmPerformanceParameters(const ConvolutionContext& ctx) const; + std::tuple + CalculateGemmABlockCopyPerformanceParameters(const ConvolutionContext& ctx) const; + std::tuple + CalculateGemmBBlockCopyPerformanceParameters(const ConvolutionContext& ctx) const; + // std::tuple CalculateLdsNumberOfByte(const ConvolutionContext& ctx) const; + bool IsValidValue() const; + bool IsValid(const ConvolutionContext& ctx) const; + void EuristicInit(const ConvolutionContext& ctx); + bool SetNextValue(); + std::string ToString() const; +}; + struct ConvHipImplicitGemmV4R1Fwd : SolverBase { PerformanceImplicitGemmV4R1 GetPerformanceConfig(const ConvolutionContext& ctx) const; @@ -997,6 +1061,29 @@ struct ConvHipImplicitGemmBwdDataV4R1 : SolverBase bool disableConfigOverrideFromEnv = false) const; }; +struct ConvHipImplicitGemmBwdDataV4R1Xdlops : SolverBase +{ + static int CalculateNumberOfGemm(const ConvolutionContext& ctx); + static std::tuple CalculateGemmSize(const ConvolutionContext& ctx, int gemm_id); + PerformanceImplicitGemmBwdDataV4R1Xdlops + GetPerformanceConfig(const ConvolutionContext& ctx) const; + bool IsValidPerformanceConfig(const ConvolutionContext& ctx, + const PerformanceImplicitGemmBwdDataV4R1Xdlops& c) const; + bool IsApplicable(const ConvolutionContext& ctx) const; + ConvSolution GetSolution(const ConvolutionContext& ctx, + const PerformanceImplicitGemmBwdDataV4R1Xdlops& config, + bool disableConfigOverrideFromEnv = false) const; + PerformanceImplicitGemmBwdDataV4R1Xdlops Search(const ConvolutionContext&) const; + int RunAndMeasureSolution(miopen::Handle& profile_h, + ConstData_t bot_buf, + Data_t top_buf, + ConstData_t wei_buf, + ConstData_t bias_buf, + const ConvolutionContext& ctx, + const ConvSolution& solution, + float& elapsed_time) const; +}; + struct ConvHipImplicitGemmBwdDataV1R1Xdlops : SolverBase { PerformanceImplicitGemmXdlops GetPerformanceConfig(const ConvolutionContext& ctx) const; diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp new file mode 100755 index 0000000000..789a24c68f --- /dev/null +++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp @@ -0,0 +1,434 @@ +#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V4R1_XDLOPS_NCHW_KCYX_NKHW_HPP +#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V4R1_XDLOPS_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm_xdlops.hpp" + +namespace ck { + +// Number of GEMMs: YTilda * XTilda +// GemmM = C +// GemmN = N * HTildaSlice * WTildaSlice +// GemmK = K * YDotSlice * XDotSlice +template +struct GridwiseConvolutionBackwardDataImplicitGemm_v4r1_xdlops_nchw_kcyx_nkhw +{ + __host__ __device__ static constexpr index_t GetNumberOfGemm() + { + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH; + constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW; + + return YTilda * XTilda; + } + + __host__ __device__ static constexpr auto GetGemmSizeImpl(index_t iYTilda, index_t iXTilda) + { + constexpr index_t N = InGlobalDesc::GetLengths()[0]; + constexpr index_t C = InGlobalDesc::GetLengths()[1]; + constexpr index_t Hi = InGlobalDesc::GetLengths()[2]; + constexpr index_t Wi = InGlobalDesc::GetLengths()[3]; + + constexpr index_t K = OutGlobalDesc::GetLengths()[1]; + constexpr index_t Ho = OutGlobalDesc::GetLengths()[2]; + constexpr index_t Wo = OutGlobalDesc::GetLengths()[3]; + + constexpr index_t Y = WeiGlobalDesc::GetLengths()[2]; + constexpr index_t X = WeiGlobalDesc::GetLengths()[3]; + + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH; + constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW; + + constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda); + constexpr index_t XDot = math::integer_divide_ceil(X, XTilda); + + constexpr index_t HTilda = + Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH); + constexpr index_t WTilda = + Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW); + + // only work on HTilda and WTilda that contribute to non-padding area of input tensor + constexpr index_t iHTildaLeft = math::integer_divide_floor( + math::max(0, InLeftPads{}[0] - ConvDilationH * (YTilda - 1)), ConvStrides{}[0]); + constexpr index_t iWTildaLeft = math::integer_divide_floor( + math::max(0, InLeftPads{}[1] - ConvDilationW * (XTilda - 1)), ConvStrides{}[1]); + + constexpr index_t iHTildaRight = math::min( + HTilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1); + constexpr index_t iWTildaRight = math::min( + WTilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1); + + constexpr index_t HTildaSlice = iHTildaRight - iHTildaLeft; + constexpr index_t WTildaSlice = iWTildaRight - iWTildaLeft; + + // GemmM and GemmN + constexpr index_t GemmM = C; + constexpr index_t GemmN = N * HTildaSlice * WTildaSlice; + + // GemmK is different for each GEMM + index_t YDotSlice = (iYTilda + 1) * YDot <= Y ? YDot : Y % YDot; + index_t XDotSlice = (iXTilda + 1) * XDot <= X ? XDot : X % XDot; + + index_t GemmK = K * YDotSlice * XDotSlice; + + return Array{GemmM, GemmN, GemmK}; + } + + __host__ __device__ static constexpr auto GetGemmSize(index_t gemm_id) + { + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW; + + index_t iYTilda = gemm_id / XTilda; + index_t iXTilda = gemm_id % XTilda; + + return GetGemmSizeImpl(iYTilda, iXTilda); + } + + template + __device__ static void RunImpl(Float* __restrict__ p_in_global, + const Float* __restrict__ p_wei_global, + const Float* __restrict__ p_out_global) + { + constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; + + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLengths()[0]; + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLengths()[1]; + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2]; + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3]; + + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLengths()[1]; + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2]; + constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLengths()[3]; + + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; + constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; + + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + //\todo static_assert for global vector load/store + // statc_assert(); + + constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH; + constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW; + + constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda); + constexpr index_t XDot = math::integer_divide_ceil(X, XTilda); + + constexpr index_t HTilda = + Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH); + constexpr index_t WTilda = + Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW); + + // only work on HTilda and WTilda that contribute to non-padding area of input tensor + constexpr index_t iHTildaLeft = math::integer_divide_floor( + math::max(0, InLeftPads{}[0] - ConvDilationH * (YTilda - 1)), ConvStrides{}[0]); + constexpr index_t iWTildaLeft = math::integer_divide_floor( + math::max(0, InLeftPads{}[1] - ConvDilationW * (XTilda - 1)), ConvStrides{}[1]); + + constexpr index_t iHTildaRight = math::min( + HTilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1); + constexpr index_t iWTildaRight = math::min( + WTilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1); + + constexpr index_t HTildaSlice = iHTildaRight - iHTildaLeft; + constexpr index_t WTildaSlice = iWTildaRight - iWTildaLeft; + + // weight out-of-bound check can be skipped + constexpr bool wei_skip_out_of_bound_check = true; + + // weight tensor + constexpr auto wei_k_c_ydot_ytilda_xdot_xtilda_global_desc = transform_tensor_descriptor( + wei_k_c_y_x_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Embed, + Sequence, + wei_skip_out_of_bound_check>{}, + Embed, + Sequence, + wei_skip_out_of_bound_check>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + +#if !CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK + constexpr bool out_skip_out_of_bound_check = false; +#else + //\todo sometimes output tensor out-of-bound check can be skipped, find out all such + // situations + constexpr bool out_skip_out_of_bound_check = true; +#endif + + // output tensor + constexpr auto out_n_k_ydot_htilda_xdot_wtilda_global_desc = transform_tensor_descriptor( + out_n_k_ho_wo_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Embed, + Sequence<-ConvDilationH / GcdStrideDilationH, 1, 0>, + out_skip_out_of_bound_check>{}, + Embed, + Sequence<-ConvDilationW / GcdStrideDilationW, 1, 0>, + out_skip_out_of_bound_check>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + constexpr auto out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc = + transform_tensor_descriptor( + out_n_k_ydot_htilda_xdot_wtilda_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + PassThrough{}, + PassThrough{}, + Slice, + Sequence, + Sequence>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{})); + +#if !CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK + constexpr bool in_skip_out_of_bound_check = false; +#else + //\todo sometimes input out-of-bound check can be skipped, find out all such situations + constexpr bool in_skip_out_of_bound_check = true; +#endif + + // input tensor + constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple( + PassThrough{}, + PassThrough{}, + Pad, InLeftPads, InRightPads, in_skip_out_of_bound_check>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + + constexpr index_t Hip = in_n_c_hip_wip_global_desc.GetLengths()[2]; + constexpr index_t Wip = in_n_c_hip_wip_global_desc.GetLengths()[3]; + + constexpr auto in_n_c_ytilda_htilda_xtilda_wtilda_global_desc = transform_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Embed, + Sequence, + in_skip_out_of_bound_check>{}, + Embed, + Sequence, + in_skip_out_of_bound_check>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + constexpr auto in_n_c_ytilda_htildaslice_xtilda_wtildaslice_global_desc = + transform_tensor_descriptor( + in_n_c_ytilda_htilda_xtilda_wtilda_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + PassThrough{}, + PassThrough{}, + Slice, + Sequence, + Sequence>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{})); + + // GEMM + constexpr index_t YDotSlice = (iYTilda + 1) * YDot <= Y ? YDot : Y % YDot; + constexpr index_t XDotSlice = (iXTilda + 1) * XDot <= X ? XDot : X % XDot; + + // A matrix + constexpr auto wei_k_c_ydotslice_ytidaslice_xdotslice_xtildaslice_global_desc = + transform_tensor_descriptor( + wei_k_c_ydot_ytilda_xdot_xtilda_global_desc, + make_tuple( + PassThrough{}, + PassThrough{}, + Slice, Sequence<0, 0>, Sequence>{}, + Slice, + Sequence, + Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 4>{}, Sequence<3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 4>{}, Sequence<3, 5>{})); + + constexpr auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor( + wei_k_c_ydotslice_ytidaslice_xdotslice_xtildaslice_global_desc, + make_tuple(Merge>{}, Merge>{}), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // B matrix + constexpr auto out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc = + transform_tensor_descriptor( + out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc, + make_tuple( + PassThrough{}, + PassThrough{}, + PassThrough{}, + PassThrough{}, + Slice, Sequence<0, 0>, Sequence>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{})); + + constexpr auto out_gemmk_gemmn_global_desc = transform_tensor_descriptor( + out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc, + make_tuple(Merge>{}, + Merge>{}), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // C matrix + constexpr auto in_n_c_ytildaslice_htildaslice_xtildaslice_wtildaslice_global_desc = + transform_tensor_descriptor( + in_n_c_ytilda_htildaslice_xtilda_wtildaslice_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + PassThrough{}, + PassThrough{}, + Slice, + Sequence, + Sequence>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{})); + + constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor( + in_n_c_ytildaslice_htildaslice_xtildaslice_wtildaslice_global_desc, + make_tuple(Merge>{}, Merge>{}), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + constexpr auto gridwise_gemm = GridwiseGemmTransposedANormalBNormalCXdlops_v1< + GridSize, + BlockSize, + Float, + AccFloat, + decltype(wei_gemmk_gemmm_global_desc), + decltype(out_gemmk_gemmn_global_desc), + decltype(in_gemmm_gemmn_global_desc), + GemmMPerBlock, + GemmNPerBlock, + GemmKPerBlock, + GemmMPerWave, + GemmNPerWave, + GemmThreadGemmDataPerReadM, + GemmThreadGemmDataPerReadN, + GemmABlockCopyThreadSliceLengths_GemmK_GemmM, + GemmABlockCopyThreadClusterLengths_GemmK_GemmM, + Sequence<1, 0>, + Sequence<1, 0>, + Sequence<0, 1>, + 1, + GemmABlockCopySrcDataPerRead_GemmM, + GemmABlockCopyDstDataPerWrite_GemmM, + GemmBBlockCopyThreadSliceLengths_GemmK_GemmN, + GemmBBlockCopyThreadClusterLengths_GemmK_GemmN, + Sequence<0, 1>, + Sequence<0, 1>, + Sequence<0, 1>, + 1, + GemmBBlockCopySrcDataPerRead_GemmN, + GemmBBlockCopyDstDataPerWrite_GemmN, + InMemoryDataOperation::Set>{}; + + gridwise_gemm.Run(p_wei_global, p_out_global, p_in_global); + } + + template + __device__ static void Run(Float* __restrict__ p_in_global, + const Float* __restrict__ p_wei_global, + const Float* __restrict__ p_out_global) + { + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH; + constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW; + + constexpr index_t iYTilda = GemmId / XTilda; + constexpr index_t iXTilda = GemmId % XTilda; + + static_assert(iYTilda < YTilda && iXTilda < XTilda, "wrong! iYtilda, iXtilda"); + + RunImpl(p_in_global, p_wei_global, p_out_global); + } +}; + +} // namespace ck +#endif diff --git a/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp new file mode 100755 index 0000000000..61a54a0c46 --- /dev/null +++ b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp @@ -0,0 +1,143 @@ +#include "common_header.hpp" +#include "gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.hpp" +#include "float_types.h" + +extern "C" __global__ + __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw( + const FLOAT* const __restrict__ p_out_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_in_global) +{ + using namespace ck; + + // read problem parameters + constexpr index_t N = CK_PARAM_PROBLEM_N; + constexpr index_t K = CK_PARAM_PROBLEM_K; + constexpr index_t C = CK_PARAM_PROBLEM_C; + constexpr index_t Hi = CK_PARAM_PROBLEM_HI; + constexpr index_t Wi = CK_PARAM_PROBLEM_WI; + constexpr index_t Ho = CK_PARAM_PROBLEM_HO; + constexpr index_t Wo = CK_PARAM_PROBLEM_WO; + constexpr index_t Y = CK_PARAM_PROBLEM_Y; + constexpr index_t X = CK_PARAM_PROBLEM_X; + + constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H; + constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W; + + constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H; + constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W; + + constexpr index_t InLeftPadH = CK_PARAM_PROBLEM_IN_LEFT_PAD_H; + constexpr index_t InLeftPadW = CK_PARAM_PROBLEM_IN_LEFT_PAD_W; + + constexpr index_t InRightPadH = CK_PARAM_PROBLEM_IN_RIGHT_PAD_H; + constexpr index_t InRightPadW = CK_PARAM_PROBLEM_IN_RIGHT_PAD_W; + + constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE; + constexpr index_t GridSize = CK_PARAM_DEPENDENT_GRID_SIZE; + + constexpr index_t GemmMPerBlock = CK_PARAM_TUNABLE_GEMM_M_PER_BLOCK; + constexpr index_t GemmNPerBlock = CK_PARAM_TUNABLE_GEMM_N_PER_BLOCK; + constexpr index_t GemmKPerBlock = CK_PARAM_TUNABLE_GEMM_K_PER_BLOCK; + + constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto out_nkhw_desc = make_native_tensor_descriptor_packed(Sequence{}); + + using ConvStrides = Sequence; + using ConvDilations = Sequence; + + using InLeftPads = Sequence; + using InRightPads = Sequence; + + // A matrix + constexpr index_t GemmABlockCopyClusterLengths_GemmK = + CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K; + + constexpr index_t GemmABlockCopyClusterLengths_GemmM = + CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M; + + constexpr index_t GemmABlockCopyThreadSliceLengths_GemmK = + GemmKPerBlock / GemmABlockCopyClusterLengths_GemmK; + + constexpr index_t GemmABlockCopyThreadSliceLengths_GemmM = + GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM; + + using GemmABlockCopyThreadSliceLengths_GemmK_GemmM = + Sequence; + + using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = + Sequence; + + constexpr index_t GemmABlockCopySrcDataPerRead_GemmM = + CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_M; + + constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = + CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_M; + + // B matrix + constexpr index_t GemmBBlockCopyClusterLengths_GemmK = + CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K; + + constexpr index_t GemmBBlockCopyClusterLengths_GemmN = + CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N; + + constexpr index_t GemmBBlockCopyThreadSliceLengths_GemmK = + GemmKPerBlock / GemmBBlockCopyClusterLengths_GemmK; + + constexpr index_t GemmBBlockCopyThreadSliceLengths_GemmN = + GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN; + + using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = + Sequence; + + using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = + Sequence; + + constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = + CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N; + + constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = + CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N; + + // C matrix + constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE; + constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE; + + constexpr index_t GemmThreadGemmDataPerReadM = 1; + constexpr index_t GemmThreadGemmDataPerReadN = 1; + + constexpr auto gridwise_conv_bwd_data = + GridwiseConvolutionBackwardDataImplicitGemm_v4r1_xdlops_nchw_kcyx_nkhw< + GridSize, + BlockSize, + FLOAT, + FLOAT_ACCUM, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + InLeftPads, + InRightPads, + GemmMPerBlock, + GemmNPerBlock, + GemmKPerBlock, + GemmMPerWave, + GemmNPerWave, + GemmThreadGemmDataPerReadM, + GemmThreadGemmDataPerReadN, + GemmABlockCopyThreadSliceLengths_GemmK_GemmM, + GemmABlockCopyThreadClusterLengths_GemmK_GemmM, + GemmABlockCopySrcDataPerRead_GemmM, + GemmABlockCopyDstDataPerWrite_GemmM, + GemmBBlockCopyThreadSliceLengths_GemmK_GemmN, + GemmBBlockCopyThreadClusterLengths_GemmK_GemmN, + GemmBBlockCopySrcDataPerRead_GemmN, + GemmBBlockCopyDstDataPerWrite_GemmN>{}; + + // these decide which GEMM will be called + constexpr index_t GemmId = CK_PARAM_GEMM_ID; + + gridwise_conv_bwd_data.template Run(p_in_global, p_wei_global, p_out_global); +} diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp old mode 100644 new mode 100755 index 02769b7723..f128e5a83b --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -139,7 +139,8 @@ static auto GetImplicitGemmSolvers() miopen::solver::ConvHipImplicitGemmV4R1Fwd, miopen::solver::ConvHipImplicitGemmV4R4Fwd, miopen::solver::ConvHipImplicitGemmBwdDataV1R1, - miopen::solver::ConvHipImplicitGemmBwdDataV4R1>{}; + miopen::solver::ConvHipImplicitGemmBwdDataV4R1, + miopen::solver::ConvHipImplicitGemmBwdDataV4R1Xdlops>{}; } static auto GetWindogradSolvers() diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp old mode 100644 new mode 100755 index 94f3f663e8..e2f6324af2 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -3104,7 +3104,9 @@ void ConvBwdImplicitGemm(const ConvolutionContext& /*ctx*/, elapsed += handle.GetKernelTime(); } else if(kernel.GetName() == - "gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw") + "gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw" || + kernel.GetName() == + "gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw") { // \todo this kernel doesn't always need to set-zero float zero = 0.f; diff --git a/src/solver.cpp b/src/solver.cpp old mode 100644 new mode 100755 index cbe6066db0..84be961d1a --- a/src/solver.cpp +++ b/src/solver.cpp @@ -302,6 +302,9 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) RegisterWithSolver( registry, ++id, ConvHipImplicitGemmBwdDataV1R1Xdlops{}, miopenConvolutionAlgoImplicitGEMM); + + RegisterWithSolver( + registry, ++id, ConvHipImplicitGemmBwdDataV4R1Xdlops{}, miopenConvolutionAlgoImplicitGEMM); } } // namespace solver diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp new file mode 100755 index 0000000000..2f8858f71a --- /dev/null +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -0,0 +1,688 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include "miopen/solver.hpp" +#include "miopen/handle.hpp" +#include +#include "implicitgemm_util.hpp" + +namespace miopen { +namespace solver { + +std::tuple +PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGridSize(const ConvolutionContext& ctx) const +{ + int GridSize = 0; + + try + { + int gemm_m = 0; + int gemm_n = 0; + + std::tie(gemm_m, gemm_n, std::ignore) = + ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateGemmSize(ctx, 0); + + if(!(gemm_m % GemmMPerBlock == 0 && gemm_n % GemmNPerBlock == 0)) + MIOPEN_THROW("invalid performance parameter"); + + GridSize = (gemm_m / GemmMPerBlock) * (gemm_n / GemmNPerBlock); + } + catch(...) + { + return std::make_tuple(-1, false); + } + + return std::make_tuple(GridSize, true); +} + +std::tuple +PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmABlockCopyPerformanceParameters( + const ConvolutionContext& ctx) const +{ + int SrcDataPerRead_GemmM = amd_buffer_load_max_length(); + int DstDataPerWrite_GemmM = amd_lds_write_max_length(); + + try + { + // calculate vector length on gemmk dimension + SrcDataPerRead_GemmM = gcd(SrcDataPerRead_GemmM, GemmMPerBlock); + + const auto y = ConvolutionContextInterpreter::GetFilterHeightY(ctx); + const auto x = ConvolutionContextInterpreter::GetFilterWidthX(ctx); + + // \todo too conservative + if(!(y == 1 && x == 1)) + SrcDataPerRead_GemmM = 1; + + // calculate threadwise copy size + const auto a_data_per_thread_copy = GemmMPerBlock / WeiBlockCopyClusterLengths_GemmM; + + if(!(a_data_per_thread_copy > 0)) + MIOPEN_THROW("invalid performance parameter"); + + // GemmABlockCopySrcDataPerRead_GemmK also bounded by size of threadwise copy + SrcDataPerRead_GemmM = gcd(SrcDataPerRead_GemmM, a_data_per_thread_copy); + + // GemmABlockCopyDstDataPerWrite_GemmM also bounded by size of threadwise copy + DstDataPerWrite_GemmM = gcd(DstDataPerWrite_GemmM, SrcDataPerRead_GemmM); + } + catch(...) + { + return std::make_tuple(-1, -1, false); + } + + return std::make_tuple(SrcDataPerRead_GemmM, DstDataPerWrite_GemmM, true); +} + +std::tuple +PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformanceParameters( + const ConvolutionContext& ctx) const +{ + int SrcDataPerRead_GemmN = amd_buffer_load_max_length(); + int DstDataPerWrite_GemmN = amd_lds_write_max_length(); + + try + { + SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, GemmNPerBlock); + + // calculate vector length on gemmn dimension + const auto y = ConvolutionContextInterpreter::GetFilterHeightY(ctx); + const auto x = ConvolutionContextInterpreter::GetFilterWidthX(ctx); + + // \todo too conversative + if(y == 1 && x == 1) + { + const auto ho = ConvolutionContextInterpreter::GetOutputHeightHo(ctx); + const auto wo = ConvolutionContextInterpreter::GetOutputWidthWo(ctx); + + SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, ho * wo); + } + else + { + SrcDataPerRead_GemmN = 1; + } + + // calculate threadwise copy size + int b_data_per_thread_copy = GemmNPerBlock / InBlockCopyClusterLengths_GemmN; + + if(!(b_data_per_thread_copy > 0)) + MIOPEN_THROW("invalid performance parameter"); + + // GemmBBlockCopySrcDataPerRead_GemmN also bounded by size of threadwise copy + SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, b_data_per_thread_copy); + + // GemmBBlockCopyDstDataPerWrite_GemmN also bounded by size of threadwise copy + DstDataPerWrite_GemmN = gcd(DstDataPerWrite_GemmN, SrcDataPerRead_GemmN); + } + catch(...) + { + MIOPEN_LOG_I("catch"); + return std::make_tuple(-1, -1, false); + } + + return std::make_tuple(SrcDataPerRead_GemmN, DstDataPerWrite_GemmN, true); +} + +bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext& ctx) const +{ + int GemmM = 0, GemmN = 0, GemmK = 0; + + const auto& GemmKBlocks = 1; + + // check blockwise GEMM size + for(int gemm_id = 0; gemm_id < ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateNumberOfGemm(ctx); + ++gemm_id) + { + + std::tie(GemmM, GemmN, GemmK) = + ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateGemmSize(ctx, gemm_id); + + if(!(GemmM % GemmMPerBlock == 0 && GemmN % GemmNPerBlock == 0 && + GemmK % (GemmKPerBlock * GemmKBlocks) == 0)) + return false; // wrong! cannot divice N evenly among thread + } + + const auto& GemmBBlockCopyClusterLengths_GemmK = InBlockCopyClusterLengths_GemmK; + const auto& GemmBBlockCopyClusterLengths_GemmN = InBlockCopyClusterLengths_GemmN; + const auto& GemmABlockCopyClusterLengths_GemmK = WeiBlockCopyClusterLengths_GemmK; + const auto& GemmABlockCopyClusterLengths_GemmM = WeiBlockCopyClusterLengths_GemmM; + + if(!(GemmKPerBlock % GemmBBlockCopyClusterLengths_GemmK == 0 && + GemmKPerBlock % GemmABlockCopyClusterLengths_GemmK == 0 && + GemmNPerBlock % GemmBBlockCopyClusterLengths_GemmN == 0 && + GemmMPerBlock % GemmABlockCopyClusterLengths_GemmM == 0)) + return false; + + // unsupported xdlops-gemm + if(GemmMPerWave == 16 && GemmNPerWave == 32) + return false; + if(GemmMPerWave == 32 && GemmNPerWave == 16) + return false; + if(GemmMPerWave == 8 && GemmNPerWave != 64) + return false; + if(GemmMPerWave == 4 && GemmNPerWave != 64) + return false; + + const auto WaveSize = 64; + const auto BlockSize = GemmNPerBlock * GemmMPerBlock / (GemmMPerWave * GemmNPerWave) * WaveSize; + + // fail with blockSize >= 512 + /// \todo fix the issue with blockSize >= 512 + if(BlockSize < 64 || BlockSize > 256) + return false; + + if(BlockSize != GemmBBlockCopyClusterLengths_GemmK * GemmBBlockCopyClusterLengths_GemmN) + return false; + + if(BlockSize != GemmABlockCopyClusterLengths_GemmM * GemmABlockCopyClusterLengths_GemmK) + return false; + + if((GemmMPerBlock % GemmMPerWave) != 0 || (GemmNPerBlock % GemmNPerWave) != 0) + return false; + + const auto GemmBBlockCopyThreadSliceLengths_GemmN = + GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN; + const auto GemmABlockCopyThreadSliceLengths_GemmM = + GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM; + const auto lds_size = ComputeLDSRequiredSize(ctx, + GemmNPerBlock, + GemmMPerBlock, + GemmKPerBlock, + 1, + 1, + GemmBBlockCopyThreadSliceLengths_GemmN, + GemmABlockCopyThreadSliceLengths_GemmM, + true); + return lds_size <= 64 * 1024; +} + +PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlops(bool spare) +{ + GemmNPerBlock = spare ? 16 : 64; + GemmMPerBlock = spare ? 4 : 64; + GemmKPerBlock = spare ? 4 : 8; + + GemmMPerWave = spare ? 4 : 64; + GemmNPerWave = spare ? 16 : 64; + + InBlockCopyClusterLengths_GemmK = 4; + InBlockCopyClusterLengths_GemmN = 4; + + WeiBlockCopyClusterLengths_GemmK = 2; + WeiBlockCopyClusterLengths_GemmM = 4; + + use_spare_set = spare; +} + +PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlops( + int BPerBlock_, + int KPerBlock_, + int EPerBlock_, + int GemmMPerWave_, + int GemmNPerWave_, + int InBlockCopyClusterLengths_E_, + int InBlockCopyClusterLengths_B_, + int WeiBlockCopyClusterLengths_E_, + int WeiBlockCopyClusterLengths_K_, + bool use_spare_set_) + : GemmNPerBlock(BPerBlock_), + GemmMPerBlock(KPerBlock_), + GemmKPerBlock(EPerBlock_), + GemmMPerWave(GemmMPerWave_), + GemmNPerWave(GemmNPerWave_), + InBlockCopyClusterLengths_GemmK(InBlockCopyClusterLengths_E_), + InBlockCopyClusterLengths_GemmN(InBlockCopyClusterLengths_B_), + WeiBlockCopyClusterLengths_GemmK(WeiBlockCopyClusterLengths_E_), + WeiBlockCopyClusterLengths_GemmM(WeiBlockCopyClusterLengths_K_), + use_spare_set(use_spare_set_) +{ +} + +bool PerformanceImplicitGemmBwdDataV4R1Xdlops:: +operator==(const PerformanceImplicitGemmBwdDataV4R1Xdlops& other) const +{ + // clang-format off + return GemmNPerBlock == other.GemmNPerBlock + && GemmMPerBlock == other.GemmMPerBlock + && GemmKPerBlock == other.GemmKPerBlock + && GemmMPerWave == other.GemmMPerWave + && GemmNPerWave == other.GemmNPerWave + && InBlockCopyClusterLengths_GemmK == other.InBlockCopyClusterLengths_GemmK + && InBlockCopyClusterLengths_GemmN == other.InBlockCopyClusterLengths_GemmN + && WeiBlockCopyClusterLengths_GemmK == other.WeiBlockCopyClusterLengths_GemmK + && WeiBlockCopyClusterLengths_GemmM == other.WeiBlockCopyClusterLengths_GemmM + && use_spare_set == other.use_spare_set; + // clang-format on +} + +bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValidValue() const +{ + // clang-format off + return IsTwoPower<16,128>(GemmNPerBlock) + && IsTwoPower<4,128>(GemmMPerBlock) + && IsTwoPower<4,32>(GemmKPerBlock) + && IsTwoPower<4,64>(GemmMPerWave) + && IsTwoPower<16,64>(GemmNPerWave) + && IsTwoPower<4,16>(InBlockCopyClusterLengths_GemmK) + && IsTwoPower<4,64>(InBlockCopyClusterLengths_GemmN) + && IsTwoPower<2,16>(WeiBlockCopyClusterLengths_GemmK) + && IsTwoPower<4,128>(WeiBlockCopyClusterLengths_GemmM); // clang-format on +} + +bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue() +{ + do + { + if(!use_spare_set) + { + if(!NextTwoPower<64, 128>(GemmNPerBlock)) + break; + if(!NextTwoPower<64, 128>(GemmMPerBlock)) + break; + if(!NextTwoPower<8, 32>(GemmKPerBlock)) + break; + } + else + { + if(!NextTwoPower<16, 128>(GemmNPerBlock)) + break; + if(!NextTwoPower<4, 128>(GemmMPerBlock)) + break; + if(!NextTwoPower<4, 32>(GemmKPerBlock)) + break; + if(!NextTwoPower<4, 64>(GemmMPerWave)) + break; + if(!NextTwoPower<16, 64>(GemmNPerWave)) + break; + } + if(!NextTwoPower<4, 16>(InBlockCopyClusterLengths_GemmK)) + break; + if(!NextTwoPower<4, 64>(InBlockCopyClusterLengths_GemmN)) + break; + if(!NextTwoPower<2, 16>(WeiBlockCopyClusterLengths_GemmK)) + break; + if(!NextTwoPower<4, 128>(WeiBlockCopyClusterLengths_GemmM)) + break; + return false; + } while(false); + + return true; +} + +void PerformanceImplicitGemmBwdDataV4R1Xdlops::EuristicInit(const ConvolutionContext& ctx) +{ + PerformanceImplicitGemmBwdDataV4R1Xdlops tmp; + tmp = {128, 128, 8, 64, 64, 4, 64, 4, 64, use_spare_set}; + if(!tmp.IsValid(ctx)) + tmp = {64, 32, 4, 32, 64, 4, 16, 2, 32, use_spare_set}; + if(!tmp.IsValid(ctx)) + tmp = {64, 32, 4, 32, 64, 4, 16, 4, 16, use_spare_set}; + if(!tmp.IsValid(ctx)) + tmp = {32, 64, 4, 64, 32, 4, 16, 4, 16, use_spare_set}; + if(!tmp.IsValid(ctx)) + tmp = {32, 32, 4, 32, 32, 4, 16, 2, 32, use_spare_set}; + if(!tmp.IsValid(ctx)) + tmp = {64, 16, 4, 16, 64, 4, 16, 4, 16, use_spare_set}; + if(!tmp.IsValid(ctx)) + tmp = {16, 64, 4, 64, 16, 4, 16, 4, 16, use_spare_set}; + if(!tmp.IsValid(ctx)) + tmp = {16, 16, 4, 16, 16, 4, 16, 4, 16, use_spare_set}; + if(!tmp.IsValid(ctx)) + tmp = {64, 4, 16, 4, 64, 16, 4, 16, 4, use_spare_set}; + if(!tmp.IsValid(ctx)) + tmp = {64, 8, 8, 8, 64, 4, 16, 8, 8, use_spare_set}; + if(!tmp.IsValid(ctx)) + { + MIOPEN_LOG_E("All attempts failed"); + assert(false); + } + *this = tmp; + MIOPEN_LOG_I(ToString()); +} + +std::string PerformanceImplicitGemmBwdDataV4R1Xdlops::ToString() const +{ + std::ostringstream ss; + Serialize(ss); + return ss.str(); +} + +int ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateNumberOfGemm(const ConvolutionContext& ctx) +{ + const auto conv_stride_h = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideH(ctx); + const auto conv_stride_w = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideW(ctx); + const auto conv_dilation_h = + ConvolutionContextInterpreter::GetAdjustedConvolutionDilationH(ctx); + const auto conv_dilation_w = + ConvolutionContextInterpreter::GetAdjustedConvolutionDilationW(ctx); + + const auto gcd_stride_dilation_h = gcd(conv_stride_h, conv_dilation_h); + const auto gcd_stride_dilation_w = gcd(conv_stride_w, conv_dilation_w); + + const auto ytilda = conv_stride_h / gcd_stride_dilation_h; + const auto xtilda = conv_stride_w / gcd_stride_dilation_w; + + return ytilda * xtilda; +} + +std::tuple +ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateGemmSize(const ConvolutionContext& ctx, int gemm_id) +{ + const auto n = ConvolutionContextInterpreter::GetBatchN(ctx); + const auto k = ConvolutionContextInterpreter::GetOutputChannelK(ctx); + const auto c = ConvolutionContextInterpreter::GetInputChannelC(ctx); + const auto hi = ConvolutionContextInterpreter::GetInputHeightHi(ctx); + const auto wi = ConvolutionContextInterpreter::GetInputWidthWi(ctx); + const auto ho = ConvolutionContextInterpreter::GetOutputHeightHo(ctx); + const auto wo = ConvolutionContextInterpreter::GetOutputWidthWo(ctx); + const auto y = ConvolutionContextInterpreter::GetFilterHeightY(ctx); + const auto x = ConvolutionContextInterpreter::GetFilterWidthX(ctx); + const auto conv_stride_h = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideH(ctx); + const auto conv_stride_w = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideW(ctx); + const auto conv_dilation_h = + ConvolutionContextInterpreter::GetAdjustedConvolutionDilationH(ctx); + const auto conv_dilation_w = + ConvolutionContextInterpreter::GetAdjustedConvolutionDilationW(ctx); + const auto in_left_pad_h = ConvolutionContextInterpreter::GetInputLeftPadH(ctx); + const auto in_left_pad_w = ConvolutionContextInterpreter::GetInputLeftPadW(ctx); + + const auto gcd_stride_dilation_h = gcd(conv_stride_h, conv_dilation_h); + const auto gcd_stride_dilation_w = gcd(conv_stride_w, conv_dilation_w); + + const auto ytilda = conv_stride_h / gcd_stride_dilation_h; + const auto xtilda = conv_stride_w / gcd_stride_dilation_w; + + const auto ydot = integer_divide_ceil(y, ytilda); + const auto xdot = integer_divide_ceil(x, xtilda); + + const auto htilda = ho + integer_divide_ceil(conv_dilation_h * (y - 1), conv_stride_h); + const auto wtilda = wo + integer_divide_ceil(conv_dilation_w * (x - 1), conv_stride_w); + + // intermediate result could be negative, use int instead of size_t + const auto htilda_left = + std::max(0, in_left_pad_h - conv_dilation_h * (ytilda - 1)) / conv_stride_h; + const auto wtilda_left = + std::max(0, in_left_pad_w - conv_dilation_w * (xtilda - 1)) / conv_stride_w; + + const auto htilda_right = + std::min(htilda, integer_divide_ceil(in_left_pad_h + hi - 1, conv_stride_h) + 1); + const auto wtilda_right = + std::min(wtilda, integer_divide_ceil(in_left_pad_w + wi - 1, conv_stride_w) + 1); + + const auto htilda_slice = htilda_right - htilda_left; + const auto wtilda_slice = wtilda_right - wtilda_left; + + // gemm_k size is different for each GEMM + const auto i_ytilda = gemm_id / xtilda; + const auto i_xtilda = gemm_id % xtilda; + + const auto ydot_slice = (i_ytilda + 1) * ydot <= y ? ydot : y % ydot; + const auto xdot_slice = (i_xtilda + 1) * xdot <= x ? xdot : x % xdot; + + const auto gemm_m = c; + const auto gemm_n = n * htilda_slice * wtilda_slice; + const auto gemm_k = k * ydot_slice * xdot_slice; + + return std::make_tuple(gemm_m, gemm_n, gemm_k); +} + +// TODO: add fp16 and bfp16 by ConvHipImplicitGemmBwdDataV4R1Xdlops::GetWorkspaceSize(const +// ConvolutionContext& ctx) const + +bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsApplicable(const ConvolutionContext& ctx) const +{ +#if WORKAROUND_SWDEV_229277_227616_229195 + if(!IsHccCompiler()) + return false; +#endif + bool is_applicable = true; + + if(!ctx.direction.IsBackwardData()) + return false; + + if(!ctx.Is2d()) + return false; + + if(!ctx.IsFp32()) + return false; + + if(ctx.group_counts != 1) + return false; + + if(!IsApplicableXdlops(ctx)) + return false; + + int gemm_m = 0; + int gemm_n = 0; + + std::tie(gemm_m, gemm_n, std::ignore) = CalculateGemmSize(ctx, 0); + + is_applicable = is_applicable && gemm_m % 32 == 0 && gemm_n % 32 == 0; + + for(int gemm_id = 0; gemm_id < CalculateNumberOfGemm(ctx); ++gemm_id) + { + int gemm_k = 0; + + std::tie(std::ignore, std::ignore, gemm_k) = CalculateGemmSize(ctx, gemm_id); + + is_applicable = is_applicable && gemm_k % 4 == 0; + } + + return is_applicable; +} + +PerformanceImplicitGemmBwdDataV4R1Xdlops +ConvHipImplicitGemmBwdDataV4R1Xdlops::GetPerformanceConfig(const ConvolutionContext& ctx) const +{ + return GetPerformanceConfigBase(ctx); +} + +bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsValidPerformanceConfig( + const ConvolutionContext& ctx, const PerformanceImplicitGemmBwdDataV4R1Xdlops& c) const +{ + MIOPEN_LOG_I(""); + return c.IsValidValue() && c.IsValid(ctx); +} +PerformanceImplicitGemmBwdDataV4R1Xdlops +ConvHipImplicitGemmBwdDataV4R1Xdlops::Search(const ConvolutionContext& ctx) const +{ + + // \todo add fp16 and bfp16 kernels + return GenericSearchBwd(*this, ctx); + + // fp16/bfp16 uses fp32 workspace to leverage fp32 atomic add + // if(ctx.IsFp16() || ctx.IsBfp16()) + // return GenericSearchBwd(*this, ctx, SearchTweak::WorkspaceInsteadOfXBuffer); + // else + // return GenericSearchBwd(*this, ctx); +} + +int ConvHipImplicitGemmBwdDataV4R1Xdlops::RunAndMeasureSolution(miopen::Handle& profile_h, + ConstData_t bot_buf, + Data_t top_buf, + ConstData_t wei_buf, + ConstData_t bias_buf, + const ConvolutionContext&, + const ConvSolution& solution, + float& elapsed_time) const +{ + assert(bias_buf == nullptr); + (void)bias_buf; + +#ifdef NDEBUG + try +#endif + { + + elapsed_time = float(0); + + for(auto& k_info : solution.construction_params) + { + + auto kernel = profile_h.AddKernel("", + "", + k_info.kernel_file, + k_info.kernel_name, + k_info.l_wk, + k_info.g_wk, + k_info.comp_options); + + kernel(bot_buf, wei_buf, top_buf); + + elapsed_time += profile_h.GetKernelTime(); + } + } + +#ifdef NDEBUG + catch(miopen::Exception& ex) + { + MIOPEN_LOG_WE(ex.what()); + return -1; + } +#endif + return 0; +} + +ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution( + const ConvolutionContext& ctx, + const PerformanceImplicitGemmBwdDataV4R1Xdlops& config, + bool) const +{ + ConvSolution result; + + assert(config.IsValid(ctx)); + + // a series of kernels + for(std::size_t gemm_id = 0; gemm_id < CalculateNumberOfGemm(ctx); ++gemm_id) + { + KernelInfo construction_parameters; + + int gemm_m = 0; + int gemm_n = 0; + int gemm_k = 0; + + std::tie(gemm_m, gemm_n, gemm_k) = CalculateGemmSize(ctx, gemm_id); + + // don't compile or launch an empty gridwise GEMM + if(gemm_k > 0) + { + int grid_size = 0; + + const std::size_t GemmMPerBlock = config.GemmMPerBlock; + const std::size_t GemmNPerBlock = config.GemmNPerBlock; + const std::size_t GemmKPerBlock = config.GemmKPerBlock; + const std::size_t GemmMPerWave = config.GemmMPerWave; + const std::size_t GemmNPerWave = config.GemmNPerWave; + + const std::size_t block_size = + GemmNPerBlock * GemmMPerBlock / (GemmMPerWave * GemmNPerWave) * wave_size; + + std::tie(grid_size, std::ignore) = config.CalculateGridSize(ctx); + + construction_parameters.l_wk.push_back(block_size); + construction_parameters.l_wk.push_back(1); + construction_parameters.l_wk.push_back(1); + + construction_parameters.g_wk.push_back(block_size * grid_size); + construction_parameters.g_wk.push_back(1); + construction_parameters.g_wk.push_back(1); + + construction_parameters.kernel_file = + "gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp"; + + construction_parameters.kernel_name = + "gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw"; + + // TODO: add fp16 calculation by GetWorkspaceSize(ctx); + result.workspce_sz = 0; + + std::size_t GemmABlockCopySrcDataPerRead_GemmM = 1; + std::size_t GemmABlockCopyDstDataPerWrite_GemmM = 1; + std::size_t GemmBBlockCopySrcDataPerRead_GemmN = 1; + std::size_t GemmBBlockCopyDstDataPerWrite_GemmN = 1; + + std::tie(GemmABlockCopySrcDataPerRead_GemmM, + GemmABlockCopyDstDataPerWrite_GemmM, + std::ignore) = config.CalculateGemmABlockCopyPerformanceParameters(ctx); + + std::tie(GemmBBlockCopySrcDataPerRead_GemmN, + GemmBBlockCopyDstDataPerWrite_GemmN, + std::ignore) = config.CalculateGemmBBlockCopyPerformanceParameters(ctx); + + // clang-format off + construction_parameters.comp_options = + std::string(" -std=c++14 ") + + std::string(" -DCK_PARAM_PROBLEM_N=") + std::to_string(ConvolutionContextInterpreter::GetBatchN(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_K=") + std::to_string(ConvolutionContextInterpreter::GetOutputChannelK(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_C=") + std::to_string(ConvolutionContextInterpreter::GetInputChannelC(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_HI=") + std::to_string(ConvolutionContextInterpreter::GetInputHeightHi(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_WI=") + std::to_string(ConvolutionContextInterpreter::GetInputWidthWi(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_HO=") + std::to_string(ConvolutionContextInterpreter::GetOutputHeightHo(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_WO=") + std::to_string(ConvolutionContextInterpreter::GetOutputWidthWo(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_Y=") + std::to_string(ConvolutionContextInterpreter::GetFilterHeightY(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_X=") + std::to_string(ConvolutionContextInterpreter::GetFilterWidthX(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_CONV_STRIDE_H=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedConvolutionStrideH(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_CONV_STRIDE_W=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedConvolutionStrideW(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_CONV_DILATION_H=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedConvolutionDilationH(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_CONV_DILATION_W=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedConvolutionDilationW(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_IN_LEFT_PAD_H=") + std::to_string(ConvolutionContextInterpreter::GetInputLeftPadH(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_IN_LEFT_PAD_W=") + std::to_string(ConvolutionContextInterpreter::GetInputLeftPadW(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_IN_RIGHT_PAD_H=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedInputRightPadH(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_IN_RIGHT_PAD_W=") + std::to_string(ConvolutionContextInterpreter::GetAdjustedInputRightPadW(ctx)) + + std::string(" -DCK_PARAM_PROBLEM_CONV_GROUP_COUNTS=") + std::to_string(ctx.group_counts) + + std::string(" -DCK_PARAM_TUNABLE_BLOCK_SIZE=") + std::to_string(block_size) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_M_PER_BLOCK=") + std::to_string(GemmMPerBlock) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_N_PER_BLOCK=") + std::to_string(GemmNPerBlock) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_K_PER_BLOCK=") + std::to_string(GemmKPerBlock) + + std::string(" -DCK_PARAM_GEMM_M_PER_WAVE=") + std::to_string(GemmMPerWave) + + std::string(" -DCK_PARAM_GEMM_N_PER_WAVE=") + std::to_string(GemmNPerWave) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.WeiBlockCopyClusterLengths_GemmK) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M=") + std::to_string(config.WeiBlockCopyClusterLengths_GemmM) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_M=") + std::to_string(GemmABlockCopySrcDataPerRead_GemmM ) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_M=") + std::to_string(GemmABlockCopyDstDataPerWrite_GemmM) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.InBlockCopyClusterLengths_GemmK) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N=") + std::to_string(config.InBlockCopyClusterLengths_GemmN) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N=") + std::to_string(GemmBBlockCopySrcDataPerRead_GemmN ) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N=") + std::to_string(GemmBBlockCopyDstDataPerWrite_GemmN) + + std::string(" -DCK_PARAM_DEPENDENT_GRID_SIZE=") + std::to_string(grid_size) + + std::string(" -DCK_THREADWISE_GEMM_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') + + std::string(" -DCK_USE_AMD_BUFFER_ATOMIC_ADD=") + (support_amd_buffer_atomic_add(ctx) ? '1' : '0') + + std::string(" -DCK_USE_AMD_XDLOPS=") + std::to_string(IsXdlopsSupport(ctx) ? 1 : 0) + + std::string(" -DCK_USE_AMD_XDLOPS_INLINE_ASM=") + std::to_string(miopen::IsEnabled(MIOPEN_DEBUG_IMPLICIT_GEMM_XDLOPS_INLINE_ASM{}) ? 1 : 0) + + std::string(" -DCK_USE_AMD_XDLOPS_EMULATE=") + (miopen::IsEnabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_XDLOPS_EMULATE{}) ? '1' : '0') + + std::string(" -DCK_PARAM_GEMM_ID=") + std::to_string(gemm_id) + + std::string(" -D__HIP_PLATFORM_HCC__=1") + + ctx.general_compile_options; + + result.construction_params.push_back(construction_parameters); + + } + } + return result; +} + +} // namespace solver +} // namespace miopen From 13e52273b030fd854ffac529126e78f0fdeff10f Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Apr 2020 12:49:51 +0000 Subject: [PATCH 02/18] add -abseil-string-find-startswith in CMakeList --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b2533e0571..16a517745a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -298,6 +298,7 @@ include(ClangTidy) enable_clang_tidy( CHECKS * + -abseil-string-find-startswith -android-cloexec-fopen # Yea we shouldn't be using rand() -cert-msc30-c From fe5a8b21ce36a16110dd63b8372fbd9cdadeb139 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 22 Apr 2020 09:25:17 +0800 Subject: [PATCH 03/18] modify caller of ComputeLDSRequiredSize --- src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp index 2f8858f71a..dcdbef7e89 100755 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -215,7 +215,7 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext& 1, GemmBBlockCopyThreadSliceLengths_GemmN, GemmABlockCopyThreadSliceLengths_GemmM, - true); + 1); return lds_size <= 64 * 1024; } From 3205ab7ec2c3abb62059cebd69163e8b05e990d2 Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 6 May 2020 09:41:00 +0800 Subject: [PATCH 04/18] merge develop branch --- src/conv/invokers/impl_gemm.cpp | 205 ++++++++++++++++++ .../miopen/conv/invokers/impl_gemm.hpp | 41 ++++ 2 files changed, 246 insertions(+) create mode 100644 src/conv/invokers/impl_gemm.cpp create mode 100644 src/include/miopen/conv/invokers/impl_gemm.hpp diff --git a/src/conv/invokers/impl_gemm.cpp b/src/conv/invokers/impl_gemm.cpp new file mode 100644 index 0000000000..4b2ddc8ddc --- /dev/null +++ b/src/conv/invokers/impl_gemm.cpp @@ -0,0 +1,205 @@ +#include + +#include +#include +#include +#include + +#include + +namespace miopen { +namespace conv { + +InvokerFactory MakeImplGemmDataInvokerFactory(const ConvolutionContext& ctx) +{ + if(ctx.direction.IsForward()) + { + return [](const std::vector& kernels) { + return [=](Handle& handle, const boost::any& primitive_parameters) { + const auto data_ctx = boost::any_cast(primitive_parameters); + const auto& tensors = data_ctx.tensors; + handle.Run(kernels[0])(tensors.in, tensors.w, tensors.out); + }; + }; + } + else + { + const auto& conv = ctx.conv_problem.GetConv(); + const auto& lowp_quant = conv.lowp_quant; + + return [conv, lowp_quant](const std::vector& kernels) { + return [=](Handle& handle, const boost::any& primitive_parameters) { + const auto data_ctx = boost::any_cast(primitive_parameters); + const auto& tensors = data_ctx.tensors; + const auto& workSpace = data_ctx.workSpace; + + // Miminum checks. Only check what is required to select + // proper invocation procedure & workspace sanity. + auto kernel = handle.Run(kernels[0]); + + float elapsed = 0; + // clang-format off + if((tensors.outDesc.GetType() == miopenHalf || + tensors.outDesc.GetType() == miopenBFloat16) && + (kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw" || + kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_gnchw_gkcyx_gnkhw" || + kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw" || + kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw")) + // clang-format on + { + float zero = 0.f; + TensorDescriptor workspaceDesc( + miopenFloat, tensors.outDesc.GetLengths(), tensors.outDesc.GetStrides()); + SetTensor(handle, workspaceDesc, workSpace, &zero); + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + + kernel(tensors.in, tensors.w, workSpace); + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + + CastTensor(handle, + &lowp_quant, + workspaceDesc, + workSpace, + tensors.outDesc, + tensors.out, + 0, + 0); + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + } + // clang-format off + else if((kernel.GetName() == "gridwise_convolution_implicit_gemm_v4_nchw_kc1x1_nkhw_lds_double_buffer") || + (kernel.GetName() == "gridwise_convolution_implicit_gemm_v4r4_xdlops_nchw_kc1x1_nkhw_lds_double_buffer")) + // clang-format on + { + bool hasStride = + (tensors.inDesc.GetLengths()[2] != tensors.outDesc.GetLengths()[2]) || + (tensors.inDesc.GetLengths()[3] != tensors.outDesc.GetLengths()[3]); + /// \todo set zero within implicitGEMM kernel + if(hasStride) + { + MIOPEN_LOG_I2("hasStride, call SetTensor with zero"); + float zero = 0.f; + SetTensor(handle, tensors.outDesc, tensors.out, &zero); + + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + } + + kernel(tensors.in, tensors.w, tensors.out); + + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + } + // clang-format off + else if(kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw" || + kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_gnchw_gkcyx_gnkhw") + // clang-format on + { + float zero = 0.f; + SetTensor(handle, tensors.outDesc, tensors.out, &zero); + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + + kernel(tensors.in, tensors.w, tensors.out); + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + } + // clang-format off + else if( + kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw" || + kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw") + // clang-format on + { + // this kernel accumulate results into input tensor, therefore need to set zero + bool is_1x1_s1 = false; + if(miopen::all_of(conv.GetConvPads(), [](auto v) { return v == 0; }) && + miopen::all_of(conv.GetConvStrides(), [](auto v) { return v == 1; })) + { + if(tensors.wDesc.GetLengths()[2] == 1 && tensors.wDesc.GetLengths()[3] == 1) + { // filter = 1 + if(tensors.wDesc.GetSize() == 4 || + (tensors.wDesc.GetSize() == 5 && tensors.wDesc.GetLengths()[4] == 1)) + { + is_1x1_s1 = true; + } + } + } + + if(!is_1x1_s1) + { + float zero = 0.f; + SetTensor(handle, tensors.outDesc, tensors.out, &zero); + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + } + + kernel(tensors.in, tensors.w, tensors.out); + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + } + // clang-format off + else if( + kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw" || + kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v4r1_ncdhw_kczyx_nkdhw") + // clang-format on + { + // \todo this kernel doesn't always need to set-zero + bool filterGeStride = false; + if(miopen::all_of(conv.GetConvPads(), [](auto v) { return v == 0; })) + { + if(tensors.wDesc.GetSize() == 4) + { // 2d + if(tensors.wDesc.GetLengths()[2] >= conv.GetConvStrides()[0] && + tensors.wDesc.GetLengths()[3] >= conv.GetConvStrides()[1]) + { + filterGeStride = true; + } + } + else + { // 3d + if(tensors.wDesc.GetLengths()[2] >= conv.GetConvStrides()[0] && + tensors.wDesc.GetLengths()[3] >= conv.GetConvStrides()[1] && + tensors.wDesc.GetLengths()[4] >= conv.GetConvStrides()[2]) + { + filterGeStride = true; + } + } + } + + if(!filterGeStride) + { + float zero = 0.f; + SetTensor(handle, tensors.outDesc, tensors.out, &zero); + + if(handle.IsProfilingEnabled()) + elapsed += handle.GetKernelTime(); + } + + // a group kernels (compiled from same source code) will be launched + for(const auto& k : kernels) + { + handle.Run(k)(tensors.in, tensors.w, tensors.out); + elapsed += handle.GetKernelTime(); + } + } + else + { + MIOPEN_THROW( + "Error running implicit GEMM backward data convolution (none workspace?)"); + } + + if(handle.IsProfilingEnabled()) + { + handle.ResetKernelTime(); + handle.AccumKernelTime(elapsed); + } + }; + }; + } +} + +} // namespace conv +} // namespace miopen diff --git a/src/include/miopen/conv/invokers/impl_gemm.hpp b/src/include/miopen/conv/invokers/impl_gemm.hpp new file mode 100644 index 0000000000..5b6dd1aa16 --- /dev/null +++ b/src/include/miopen/conv/invokers/impl_gemm.hpp @@ -0,0 +1,41 @@ +/******************************************************************************* +* +* MIT License +* +* Copyright (c) 2019 Advanced Micro Devices, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in all +* copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#pragma once + +#include +#include +#include + +#include + +namespace miopen { +namespace conv { + +InvokerFactory MakeImplGemmDataInvokerFactory(const ConvolutionContext& ctx); + +} // namespace conv +} // namespace miopen From c2abbb7669e1fb6c700c637639aedfb825d11e3b Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 6 May 2020 09:42:01 +0800 Subject: [PATCH 05/18] merge develop branch 1 --- Jenkinsfile | 4 +- src/binary_cache.cpp | 6 +- src/find_db.cpp | 4 +- .../miopen/conv/problem_description.hpp | 10 + src/include/miopen/execution_context.hpp | 34 ++- src/include/miopen/kern_db.hpp | 61 ++--- src/include/miopen/problem_description.hpp | 3 + src/include/miopen/sqlite_db.hpp | 259 ++++++------------ src/kern_db.cpp | 33 +-- ...r1_xdlops_fp16_bfp16_gnchw_gkcyx_gnkhw.hpp | 6 +- ..._v1r1_xdlops_fp16_bfp16_nchw_kcyx_nkhw.hpp | 6 +- ...bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp | 8 +- ...bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp | 12 +- ...bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp | 8 +- .../gridwise_gemm_fp16_bfp16.hpp | 6 +- .../gridwise_gemm_xdlops_fp16_bfp16.hpp | 12 +- .../include/tensor_operation/xdlops_gemm.hpp | 36 ++- .../include/utility/float_type.hpp | 45 +-- ...ps_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp | 14 +- ...dlops_nchw_kcyx_nkhw_lds_double_buffer.cpp | 14 +- src/kernels/float_types.h | 2 +- src/ocl/gcn_asm_utils.cpp | 28 +- src/ocl/utilocl.cpp | 104 +++---- .../conv_hip_implicit_gemm_bwd_data_v1r1.cpp | 10 +- ...hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp | 10 +- .../conv_hip_implicit_gemm_bwd_data_v4r1.cpp | 16 +- src/solver/conv_hip_implicit_gemm_v4.cpp | 17 +- src/solver/conv_hip_implicit_gemm_v4r1.cpp | 18 +- src/solver/conv_hip_implicit_gemm_v4r4.cpp | 10 +- ...conv_hip_implicit_gemm_v4r4_gen_xdlops.cpp | 14 +- ...implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp | 13 +- .../conv_hip_implicit_gemm_v4r4_xdlops.cpp | 28 +- src/solver/conv_ocl_dir2D_bwdWrW_2.cpp | 2 + src/sqlite_db.cpp | 218 ++++++++++++++- test/find_db.cpp | 1 + test/sqlite_perfdb.cpp | 66 ++--- 36 files changed, 655 insertions(+), 483 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 27379571cf..c0851e48bb 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -250,7 +250,7 @@ pipeline { } } - stage('Hip clang release') { + stage('Hip clang debug') { agent{ label rocmnode("vega") } environment{ cmd = """ @@ -258,7 +258,7 @@ pipeline { rm -rf build mkdir build cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=debug -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. CTEST_PARALLEL_LEVEL=4 MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM=0 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ diff --git a/src/binary_cache.cpp b/src/binary_cache.cpp index 9fef73ae0b..5e48baa449 100644 --- a/src/binary_cache.cpp +++ b/src/binary_cache.cpp @@ -60,9 +60,9 @@ boost::filesystem::path ComputeUserCachePath() #ifdef MIOPEN_CACHE_DIR std::string cache_dir = MIOPEN_CACHE_DIR; - std::string version = std::to_string(MIOPEN_VERSION_MAJOR) + "." + - std::to_string(MIOPEN_VERSION_MINOR) + "." + - std::to_string(MIOPEN_VERSION_PATCH); + std::string version = + std::to_string(MIOPEN_VERSION_MAJOR) + "." + std::to_string(MIOPEN_VERSION_MINOR) + "." + + std::to_string(MIOPEN_VERSION_PATCH) + "." + MIOPEN_STRINGIZE(MIOPEN_VERSION_TWEAK); auto p = boost::filesystem::path{miopen::ExpandUser(cache_dir)} / version; if(!boost::filesystem::exists(p)) diff --git a/src/find_db.cpp b/src/find_db.cpp index 68664ed0ec..936ad8f06e 100644 --- a/src/find_db.cpp +++ b/src/find_db.cpp @@ -63,7 +63,9 @@ bool CheckInvokerSupport(const std::string& algo) algo == "miopenConvolutionBwdDataAlgoDirect" || algo == "miopenConvolutionBwdWeightsAlgoDirect" || algo == "miopenConvolutionFwdAlgoWinograd" || - algo == "miopenConvolutionBwdDataAlgoWinograd"; + algo == "miopenConvolutionBwdDataAlgoWinograd" || + algo == "miopenConvolutionFwdAlgoImplicitGEMM" || + algo == "miopenConvolutionBwdDataAlgoImplicitGEMM"; } template diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp index 33c61cd206..2cbe15e5d2 100644 --- a/src/include/miopen/conv/problem_description.hpp +++ b/src/include/miopen/conv/problem_description.hpp @@ -224,6 +224,7 @@ struct ProblemDescription const TensorDescriptor& GetIn() const { return in; } const TensorDescriptor& GetWeights() const { return weights; } const TensorDescriptor& GetOut() const { return out; } + const ConvolutionDescriptor& GetConv() const { return conv; } Direction GetDirection() const { return direction; } int GetBias() const { return bias; } @@ -235,6 +236,15 @@ struct ProblemDescription std::size_t GetBackwardPadW() const { return GetWeightsWidth() - GetPadW() - 1; } std::size_t GetBackwardPadH() const { return GetWeightsHeight() - GetPadW() - 1; } + bool IsAsymmetricPadH() const + { + return conv.paddingMode == miopenPaddingSame && (GetWeightsHeight() % 2) == 0; + } + bool IsAsymmetricPadW() const + { + return conv.paddingMode == miopenPaddingSame && (GetWeightsWidth() % 2) == 0; + } + bool Is2d() const { return GetSpatialDims() == 2; } bool IsFp32() const diff --git a/src/include/miopen/execution_context.hpp b/src/include/miopen/execution_context.hpp index b695f5b300..0e7374fc76 100644 --- a/src/include/miopen/execution_context.hpp +++ b/src/include/miopen/execution_context.hpp @@ -28,6 +28,7 @@ #include #include +#include #include @@ -83,32 +84,39 @@ struct ExecutionContext std::string GetPerfDbPath() const { - // clang-format off - return GetSystemDbPath() + boost::filesystem::path pdb_path(GetSystemDbPath()); + std::ostringstream filename; +// clang-format off #if MIOPEN_ENABLE_SQLITE - + "/miopen.db"; + filename << "miopen.db"; #else - + "/" - + GetStream().GetDbBasename() - + ".cd.pdb.txt"; + filename << GetStream().GetDbBasename() + << ".cd.pdb.txt"; #endif // clang-format on + return (pdb_path / filename.str()).string(); } std::string GetUserPerfDbPath() const { + // an empty user-db path indicates user intent to disable + // the database. Default in when dev builds are on // clang-format off - return GetUserDbPath() + const auto& udb = GetUserDbPath(); + if(udb.empty()) + return ""; + boost::filesystem::path pdb_path(udb); + std::ostringstream filename; #if MIOPEN_ENABLE_SQLITE - + "/miopen.udb"; + filename << "miopen_" << SQLitePerfDb::MIOPEN_PERFDB_SCHEMA_VER << ".udb"; #else - + "/" - + GetStream().GetDbBasename() - + "." - + GetUserDbSuffix() - + ".cd.updb.txt"; + filename << GetStream().GetDbBasename() + << "." + << GetUserDbSuffix() + << ".cd.updb.txt"; #endif // clang-format on + return (pdb_path / filename.str()).string(); } private: diff --git a/src/include/miopen/kern_db.hpp b/src/include/miopen/kern_db.hpp index a471ae8aef..d176c59aa2 100644 --- a/src/include/miopen/kern_db.hpp +++ b/src/include/miopen/kern_db.hpp @@ -105,13 +105,13 @@ class KernDb : public SQLiteBase return true; auto del_query = "DELETE FROM " + T::table_name() + " WHERE " + problem_config.Where() + ";"; - sqlite3_stmt_ptr pStmt = Prepare(del_query); - auto rc = SQLRety([&]() { return sqlite3_step(pStmt.get()); }); + auto stmt = SQLite::Statement{sql, del_query}; + auto rc = stmt.Step(sql); if(rc == SQLITE_DONE) return true; else { - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); + MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage()); return false; } } @@ -124,18 +124,15 @@ class KernDb : public SQLiteBase // Where clause with inserted values defeats the purpose of a prepraed statement auto select_query = "SELECT kernel_blob, kernel_hash, uncompressed_size FROM " + T::table_name() + " WHERE " + problem_config.Where() + ";"; - sqlite3_stmt_ptr pStmt = Prepare(select_query); + auto stmt = SQLite::Statement{sql, select_query}; // only one result field // assert one row - auto rc = SQLRety([&]() { return sqlite3_step(pStmt.get()); }); + auto rc = stmt.Step(sql); if(rc == SQLITE_ROW) { - auto ptr = sqlite3_column_blob(pStmt.get(), 0); - auto sz = sqlite3_column_bytes(pStmt.get(), 0); - std::string compressed_blob(reinterpret_cast(ptr), sz); - std::string md5_hash(reinterpret_cast(sqlite3_column_text(pStmt.get(), 1)), - sqlite3_column_bytes(pStmt.get(), 1)); - auto uncompressed_size = sqlite3_column_int64(pStmt.get(), 2); + auto compressed_blob = stmt.ColumnBlob(0); + auto md5_hash = stmt.ColumnText(1); + auto uncompressed_size = stmt.ColumnInt64(2); std::string& decompressed_blob = compressed_blob; if(uncompressed_size != 0) { @@ -149,7 +146,7 @@ class KernDb : public SQLiteBase else if(rc == SQLITE_DONE) return boost::none; else - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); + MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage()); return boost::none; } @@ -165,44 +162,24 @@ class KernDb : public SQLiteBase auto uncompressed_size = problem_config.kernel_blob.size(); bool success = false; auto compressed_blob = compress_fn(problem_config.kernel_blob, &success); - sqlite3_stmt_ptr pStmt = Prepare(insert_query); - sqlite3_bind_text(pStmt.get(), - 1, - problem_config.kernel_name.data(), - problem_config.kernel_name.size(), - SQLITE_TRANSIENT); // NOLINT - sqlite3_bind_text(pStmt.get(), - 2, - problem_config.kernel_args.data(), - problem_config.kernel_args.size(), - SQLITE_TRANSIENT); // NOLINT + auto stmt = SQLite::Statement{sql, insert_query}; + stmt.BindText(1, problem_config.kernel_name); + stmt.BindText(2, problem_config.kernel_args); if(!success) { - sqlite3_bind_blob(pStmt.get(), - 3, - problem_config.kernel_blob.data(), - problem_config.kernel_blob.size(), - SQLITE_TRANSIENT); // NOLINT - sqlite3_bind_int64(pStmt.get(), 5, 0); + stmt.BindBlob(3, problem_config.kernel_blob); + stmt.BindInt64(5, 0); } else { - sqlite3_bind_blob(pStmt.get(), - 3, - compressed_blob.data(), - compressed_blob.size(), - SQLITE_TRANSIENT); // NOLINT - sqlite3_bind_int64(pStmt.get(), 5, uncompressed_size); + stmt.BindBlob(3, compressed_blob); + stmt.BindInt64(5, uncompressed_size); } - sqlite3_bind_text(pStmt.get(), - 4, - md5_sum.data(), - md5_sum.size(), - SQLITE_TRANSIENT); // NOLINT + stmt.BindText(4, md5_sum); - auto rc = SQLRety([&]() { return sqlite3_step(pStmt.get()); }); + auto rc = stmt.Step(sql); if(rc != SQLITE_DONE) - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); + MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage()); return problem_config.kernel_blob; } }; diff --git a/src/include/miopen/problem_description.hpp b/src/include/miopen/problem_description.hpp index ea51887176..f6bf8d9766 100644 --- a/src/include/miopen/problem_description.hpp +++ b/src/include/miopen/problem_description.hpp @@ -174,6 +174,9 @@ struct ProblemDescription int GetBackwardPadW() const { return kernel_size_w - pad_w - 1; } int GetBackwardPadH() const { return kernel_size_h - pad_h - 1; } + bool IsAsymmetricPadH() const { return conv_problem.IsAsymmetricPadH(); } + bool IsAsymmetricPadW() const { return conv_problem.IsAsymmetricPadW(); } + bool Is2d() const { return spatial_dims == 2; } bool Is3d() const { return spatial_dims == 3; } diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp index 107435e986..3c80254087 100644 --- a/src/include/miopen/sqlite_db.hpp +++ b/src/include/miopen/sqlite_db.hpp @@ -155,22 +155,58 @@ struct SQLiteSerializable } }; +class SQLite +{ + class impl; + // do we need propagate const + std::unique_ptr pImpl; + + public: + class Statement + { + class impl; + std::unique_ptr pImpl; + + public: + Statement(const SQLite& sql, const std::string& query); + Statement(const SQLite& sql, + const std::string& query, + const std::vector& vals); + Statement(); + ~Statement(); + Statement(Statement&&) noexcept; + Statement& operator=(Statement&&) noexcept; + Statement& operator=(const Statement&) = delete; + int Step(const SQLite& sql); + std::string ColumnText(int idx); + std::string ColumnBlob(int idx); + int64_t ColumnInt64(int idx); + int BindText(int idx, const std::string& txt); + int BindBlob(int idx, const std::string& blob); + int BindInt64(int idx, int64_t); + }; + + using result_type = std::vector>; + SQLite(); + SQLite(const std::string& filename_, bool is_system); + ~SQLite(); + SQLite(SQLite&&) noexcept; + SQLite& operator=(SQLite&&) noexcept; + SQLite& operator=(const SQLite&) = delete; + bool Valid() const; + result_type Exec(const std::string& query) const; + int Changes() const; + int Retry(std::function) const; + static int Retry(std::function f, std::string filename); + std::string ErrorMessage() const; +}; + template class SQLiteBase { protected: - struct SQLiteCloser - { - void operator()(sqlite3* ptr) - { - std::string filename_(sqlite3_db_filename(ptr, "main")); - SQLiteBase::SQLRety([&]() { return sqlite3_close(ptr); }, filename_); - } - }; - using sqlite3_ptr = std::unique_ptr; - using exclusive_lock = boost::unique_lock; - using shared_lock = boost::shared_lock; - using sqlite3_stmt_ptr = MIOPEN_MANAGE_PTR(sqlite3_stmt*, sqlite3_finalize); + using exclusive_lock = boost::unique_lock; + using shared_lock = boost::shared_lock; static boost::system_time GetLockTimeout() { return boost::get_system_time() + boost::posix_time::milliseconds(60000); @@ -199,6 +235,11 @@ class SQLiteBase { auto file = boost::filesystem::path(filename_); const auto directory = file.remove_filename(); + if(directory.string().empty()) + { + dbInvalid = true; + return; + } if(!(boost::filesystem::exists(directory))) { @@ -208,26 +249,15 @@ class SQLiteBase boost::filesystem::permissions(directory, boost::filesystem::all_all); } } - sqlite3* ptr_tmp; - int rc = 0; - if(is_system) - rc = sqlite3_open_v2(filename_.c_str(), &ptr_tmp, SQLITE_OPEN_READONLY, nullptr); - else - rc = sqlite3_open_v2( - filename_.c_str(), &ptr_tmp, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, nullptr); - ptrDb = sqlite3_ptr{ptr_tmp}; - if(rc != 0) + sql = std::move(SQLite{filename_, is_system}); + if(!sql.Valid()) { dbInvalid = true; if(!is_system) - { MIOPEN_THROW(miopenStatusInternalError, "Cannot open database file:" + filename_); - } else - { MIOPEN_LOG_W("Unable to read system database file:" + filename_ + " Performance may degrade"); - } } else dbInvalid = false; @@ -236,27 +266,16 @@ class SQLiteBase static Derived& GetCached(const std::string& path, bool is_system, const std::string& arch, std::size_t num_cu); // TODO: Fix this for the overhead of having fields per record - using SQLRes_t = std::vector>; - - static int find_callback(void* _res, int argc, char** argv, char** azColName) - { - SQLRes_t* res = static_cast(_res); - std::unordered_map record; - for(auto i = 0; i < argc; i++) - record[azColName[i]] = (argv[i] != nullptr) ? argv[i] : "NULL"; - res->push_back(record); - return 0; - } inline auto CheckTableColumns(const std::string& tableName, const std::vector& goldenList) const { const auto sql_cfg_fds = "PRAGMA table_info(" + tableName + ");"; - SQLRes_t cfg_res; + SQLite::result_type cfg_res; { const auto lock = shared_lock(lock_file, GetLockTimeout()); MIOPEN_VALIDATE_LOCK(lock); - SQLExec(sql_cfg_fds, cfg_res); + cfg_res = sql.Exec(sql_cfg_fds); } std::vector cfg_fds(cfg_res.size()); std::transform( @@ -277,107 +296,6 @@ class SQLiteBase return AllFound; } - inline auto SQLExec(const std::string& query) - { - MIOPEN_LOG_T(std::this_thread::get_id() << ":" << query); - { - auto rc = SQLRety([&]() { - return sqlite3_exec(ptrDb.get(), query.c_str(), find_callback, nullptr, nullptr); - }); - if(rc != SQLITE_OK) - { - MIOPEN_LOG_I2(query); - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); - sqlite3_close(ptrDb.get()); - return false; - } - } - return true; - } - inline auto SQLExec(const std::string& query, SQLRes_t& res) const - { - res.clear(); - MIOPEN_LOG_T(std::this_thread::get_id() << ":" << query); - { - auto rc = SQLRety([&]() { - return sqlite3_exec( - ptrDb.get(), query.c_str(), find_callback, static_cast(&res), nullptr); - }); - if(rc != SQLITE_OK) - { - MIOPEN_LOG_I2(query); - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); - sqlite3_close(ptrDb.get()); - return false; - } - } - return true; - } - - template - inline int SQLRety(F f) const - { - return SQLiteBase::SQLRety(f, filename); - } - - template - static inline int SQLRety(F f, std::string filename) - { - auto timeout_end = std::chrono::high_resolution_clock::now() + - std::chrono::seconds(30); // TODO: make configurable - auto tries = 0; - while(true) - { - int rc = f(); - if(rc == SQLITE_BUSY) - { - MIOPEN_LOG_I2("Database" + filename + " busy, retrying ..."); - ++tries; - if(tries > 50) - std::this_thread::sleep_for(std::chrono::microseconds(100)); - else - std::this_thread::yield(); - } - else - return rc; - if(std::chrono::high_resolution_clock::now() > timeout_end) - MIOPEN_THROW("Timeout while waiting for Database: " + filename); - } - } - - inline std::string SQLErrorMessage() const - { - std::string errMsg = "Internal error while accessing SQLite database: "; - return errMsg + sqlite3_errmsg(ptrDb.get()); - } - - auto Prepare(const std::string& query) const - { - sqlite3_stmt* ptr = nullptr; - MIOPEN_LOG_I2(query); - auto rc = sqlite3_prepare_v2(ptrDb.get(), query.c_str(), query.size(), &ptr, nullptr); - if(rc != SQLITE_OK) - { - std::string err_msg = "SQLite prepare error: "; - MIOPEN_THROW(miopenStatusInternalError, err_msg + sqlite3_errmsg(ptrDb.get())); - } - return sqlite3_stmt_ptr{ptr}; - } - auto PrepareAndBind(const std::string& query, std::vector& values) const - { - auto stmt = Prepare(query); - int cnt = 1; - for(auto& kinder : values) - { - auto rc = sqlite3_bind_text( - stmt.get(), cnt++, kinder.data(), kinder.size(), SQLITE_TRANSIENT); // NOLINT - if(rc != SQLITE_OK) - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); - } - MIOPEN_LOG_I2("[" << JoinStrings(values, ",") << "]"); - return stmt; - } - template inline auto FindRecord(U&... args) { @@ -426,13 +344,12 @@ class SQLiteBase return reinterpret_cast(this)->LoadUnsafe(args...); } - protected: std::string filename; std::string arch; size_t num_cu; LockFile& lock_file; - sqlite3_ptr ptrDb = nullptr; bool dbInvalid; + SQLite sql; }; template @@ -457,6 +374,7 @@ Derived& SQLiteBase::GetCached(const std::string& path, class SQLitePerfDb : public SQLiteBase { public: + static constexpr char const* MIOPEN_PERFDB_SCHEMA_VER = "1.0.0"; SQLitePerfDb(const std::string& filename_, bool is_system, const std::string& arch_, @@ -468,12 +386,12 @@ class SQLitePerfDb : public SQLiteBase std::string clause; std::vector vals; std::tie(clause, vals) = prob_desc.InsertQuery(); - auto stmt = PrepareAndBind(clause, vals); - auto rc = SQLRety([&]() { return sqlite3_step(stmt.get()); }); + auto stmt = SQLite::Statement{sql, clause, vals}; + auto rc = stmt.Step(sql); if(rc != SQLITE_DONE) MIOPEN_THROW(miopenStatusInternalError, - "Failed to insert config: " + SQLErrorMessage()); - auto cnt = sqlite3_changes(ptrDb.get()); + "Failed to insert config: " + sql.ErrorMessage()); + auto cnt = sql.Changes(); MIOPEN_LOG_I2(cnt << " rows updated"); } template @@ -483,20 +401,16 @@ class SQLitePerfDb : public SQLiteBase std::vector vals; std::tie(clause, vals) = prob_desc.WhereClause(); auto query = "SELECT id FROM " + prob_desc.table_name() + " WHERE ( " + clause + " );"; - auto stmt = PrepareAndBind(query, vals); + auto stmt = SQLite::Statement{sql, query, vals}; while(true) { - auto rc = SQLRety([&]() { return sqlite3_step(stmt.get()); }); + auto rc = stmt.Step(sql); if(rc == SQLITE_ROW) - { - auto id = - std::string(reinterpret_cast(sqlite3_column_text(stmt.get(), 0))); - return id; - } + return stmt.ColumnText(0); else if(rc == SQLITE_DONE) return ""; else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE) - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); + MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage()); } } template @@ -518,22 +432,17 @@ class SQLitePerfDb : public SQLiteBase "AND (arch = '" + arch + "' ) " "AND (num_cu = '" + std::to_string(num_cu) + "');"; // clang-format on - auto stmt = PrepareAndBind(select_query, values); + auto stmt = SQLite::Statement{sql, select_query, values}; DbRecord rec; while(true) { - auto rc = SQLRety([&]() { return sqlite3_step(stmt.get()); }); + auto rc = stmt.Step(sql); if(rc == SQLITE_ROW) - { - auto c_slvr = sqlite3_column_text(stmt.get(), 0); - auto c_params = sqlite3_column_text(stmt.get(), 1); - rec.SetValues(std::string(reinterpret_cast(c_slvr)), - std::string(reinterpret_cast(c_params))); - } + rec.SetValues(stmt.ColumnText(0), stmt.ColumnText(1)); else if(rc == SQLITE_DONE) break; else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE) - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); + MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage()); } if(rec.GetSize() == 0) return boost::none; @@ -561,14 +470,14 @@ class SQLitePerfDb : public SQLiteBase + clause + " ) )" "AND solver == '" + id + "' ;"; // clang-format on - auto stmt = PrepareAndBind(query, values); - auto rc = SQLRety([&]() { return sqlite3_step(stmt.get()); }); + auto stmt = SQLite::Statement{sql, query, values}; + auto rc = stmt.Step(sql); if(rc == SQLITE_DONE) return true; else { std::string msg = "Unable to remove database entry: "; - MIOPEN_LOG_E(msg + SQLErrorMessage()); + MIOPEN_LOG_E(msg + sql.ErrorMessage()); return false; } } @@ -586,12 +495,12 @@ class SQLitePerfDb : public SQLiteBase std::string clause; std::vector vals; std::tie(clause, vals) = problem_config.InsertQuery(); - auto stmt = PrepareAndBind(clause, vals); - auto rc = SQLRety([&]() { return sqlite3_step(stmt.get()); }); + auto stmt = SQLite::Statement{sql, clause, vals}; + auto rc = stmt.Step(sql); if(rc != SQLITE_DONE) MIOPEN_THROW(miopenStatusInternalError, - "Failed to insert config: " + SQLErrorMessage()); - auto cnt = sqlite3_changes(ptrDb.get()); + "Failed to insert config: " + sql.ErrorMessage()); + auto cnt = sql.Changes(); MIOPEN_LOG_I2(cnt << " rows updated"); } @@ -615,12 +524,12 @@ class SQLitePerfDb : public SQLiteBase vals.push_back(params.str()); vals.push_back(arch); vals.push_back(std::to_string(num_cu)); - auto stmt = PrepareAndBind(query, vals); - auto rc = SQLRety([&]() { return sqlite3_step(stmt.get()); }); + auto stmt = SQLite::Statement{sql, query, vals}; + auto rc = stmt.Step(sql); if(rc != SQLITE_DONE) { MIOPEN_LOG_E("Failed to insert performance record in the database: " + - SQLErrorMessage()); + sql.ErrorMessage()); return boost::none; } } @@ -655,11 +564,11 @@ class SQLitePerfDb : public SQLiteBase "SELECT id FROM config WHERE ( " + clause + " ))"; // clang-format on - auto stmt = PrepareAndBind(query, values); - auto rc = SQLRety([&]() { return sqlite3_step(stmt.get()); }); + auto stmt = SQLite::Statement{sql, query, values}; + auto rc = stmt.Step(sql); if(rc != SQLITE_DONE) { - MIOPEN_LOG_E("Unable to Clear databaes entry: " + SQLErrorMessage()); + MIOPEN_LOG_E("Unable to Clear databaes entry: " + sql.ErrorMessage()); return false; } else diff --git a/src/kern_db.cpp b/src/kern_db.cpp index 07faf5998a..b991e59574 100644 --- a/src/kern_db.cpp +++ b/src/kern_db.cpp @@ -44,32 +44,29 @@ KernDb::KernDb(const std::string& filename_, compress_fn(_compress_fn), decompress_fn(_decompress_fn) { + if(dbInvalid) + { + if(filename.empty()) + MIOPEN_LOG_I("database not present"); + else + MIOPEN_LOG_I(filename + " database invalid"); + return; + } if(!is_system) { const auto lock = exclusive_lock(lock_file, GetLockTimeout()); MIOPEN_VALIDATE_LOCK(lock); const std::string create_table = KernelConfig::CreateQuery(); - if(!SQLExec(create_table)) - MIOPEN_THROW(miopenStatusInternalError); + sql.Exec(create_table); MIOPEN_LOG_I2("Database created successfully"); } - if(!dbInvalid) + if(!CheckTableColumns(KernelConfig::table_name(), KernelConfig::FieldNames())) { - if(!CheckTableColumns(KernelConfig::table_name(), KernelConfig::FieldNames())) - { - std::ostringstream ss; - ss << "Invalid fields in table: " << KernelConfig::table_name() - << " disabling access to " << filename; - MIOPEN_LOG_W(ss.str()); - dbInvalid = true; - } - } - else - { - if(filename.empty()) - MIOPEN_LOG_I("database not present"); - else - MIOPEN_LOG_I(filename + " database invalid"); + std::ostringstream ss; + ss << "Invalid fields in table: " << KernelConfig::table_name() << " disabling access to " + << filename; + MIOPEN_LOG_W(ss.str()); + dbInvalid = true; } } diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_gnchw_gkcyx_gnkhw.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_gnchw_gkcyx_gnkhw.hpp index ff5f5e0704..614c129b35 100644 --- a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_gnchw_gkcyx_gnkhw.hpp +++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_gnchw_gkcyx_gnkhw.hpp @@ -155,11 +155,11 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_xdlops_fp16_bfp16_gnchw_ GridwiseBatchedGemmTransposedANormalBNormalCXdlopsFp16Bfp16_v1< GridSize, BlockSize, - Float, // Input data type = half (fp16) or ushort (bfp16) + Float, // Input data type = fp16 (fp16) or ushort (bfp16) AccFloat, // Acc data type = float - AccFloat, // Output data type = float (not half/ushort as this kernel uses atomic + AccFloat, // Output data type = float (not fp16/ushort as this kernel uses atomic // add. - // No ISA for half/ushort atomic add) + // No ISA for fp16/ushort atomic add) decltype(wei_gemmg_gemmk_gemmm_gemmkpack_global_desc), decltype(out_gemmg_gemmk_gemmn_gemmkpack_global_desc), decltype(in_gemmg_gemmm_gemmn_global_desc), diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_nchw_kcyx_nkhw.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_nchw_kcyx_nkhw.hpp index 5f2e1422df..5a59bda3a2 100644 --- a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_nchw_kcyx_nkhw.hpp +++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_fp16_bfp16_nchw_kcyx_nkhw.hpp @@ -142,10 +142,10 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_xdlops_f16_bfp16_nchw_kc constexpr auto gridwise_gemm = GridwiseGemmTransposedANormalBNormalCXdlopsFp16Bfp16_v1< GridSize, BlockSize, - Float, // Input data type = half (fp16) or ushort (bfp16) + Float, // Input data type = fp16 (fp16) or ushort (bfp16) AccFloat, // Acc data type = float - AccFloat, // Output data type = float (not half/ushort as this kernel uses atomic add. - // No ISA for half/ushort atomic add) + AccFloat, // Output data type = float (not fp16/ushort as this kernel uses atomic add. + // No ISA for fp16/ushort atomic add) decltype(wei_gemmk_gemmm_gemmkpack_global_desc), decltype(out_gemmk_gemmn_gemmkpack_global_desc), decltype(in_gemmm_gemmn_global_desc), diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp index 2e78877f88..680fd1a7bf 100644 --- a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -405,12 +405,12 @@ struct GridwiseConvolutionImplicitGemm_v4_fp16_bfp16_nchw_kcyx_nkhw_lds_double_b blockwise_wei_copy.RunLoadThreadBuffer(p_wei_block_on_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data - // Vectorize the pointer to match with how half/bfloat16 datatypes are - // processed in gemm operation. Half type packs 4 half values while + // Vectorize the pointer to match with how fp16/bfloat16 datatypes are + // processed in gemm operation. fp16 type packs 4 fp16 values while // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B // 2D indexes are computed with a single value in mind (e.g. float), - // to retain the same 2D indexes for half/bfloat16, we recast datatype - // from a single half to 4 packed half/2 packed bfloat16 respectively. + // to retain the same 2D indexes for fp16/bfloat16, we recast datatype + // from a single fp16 to 4 packed fp16/2 packed bfloat16 respectively. const typename vector_type::MemoryType* p_a_block_vec = reinterpret_cast::MemoryType*>( p_wei_block_double); diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp index 7c85e48ed7..d76c866d0e 100644 --- a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -425,11 +425,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data - // Vectorize the pointer to match with how half/bfloat16 datatypes are - // processed in gemm operation. Half type packs 4 half values while + // Vectorize the pointer to match with how fp16/bfloat16 datatypes are + // processed in gemm operation. fp16 type packs 4 fp16 values while // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B // 2D indexes are computed with vectorized data in mind (e.g. float,half4, short2), - // we recast datatype from a single half/bfloat16 to 4 packed half/2 packed bfloat16 + // we recast datatype from a single fp16/bfloat16 to 4 packed fp16/2 packed bfloat16 // respectively. const typename vector_type::MemoryType* p_a_block_vec = reinterpret_cast::MemoryType*>( @@ -465,11 +465,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_fp16_bfp16_nchw_kcyx_nkhw_lds_double blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on 2nd-last data - // Vectorize the pointer to match with how half/bfloat16 datatypes are - // processed in gemm operation. Half type packs 4 half values while + // Vectorize the pointer to match with how fp16/bfloat16 datatypes are + // processed in gemm operation. fp16 type packs 4 fp16 values while // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B // 2D indexes are computed with vectorized data in mind (e.g. float,half4, short2), - // we recast datatype from a single half/bfloat16 to 4 packed half/2 packed bfloat16 + // we recast datatype from a single fp16/bfloat16 to 4 packed fp16/2 packed bfloat16 // respectively. const typename vector_type::MemoryType* p_a_block_vec = reinterpret_cast::MemoryType*>( diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp index 792127a001..6380731116 100644 --- a/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/src/kernels/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -340,12 +340,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_xdlops_fp16_bfp16_nchw_kcyx_nkhw_lds blockwise_wei_copy.RunLoadThreadBuffer(p_wei_block_on_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data - // Vectorize the pointer to match with how half/bfloat16 datatypes are - // processed in gemm operation. Half type packs 4 half values while + // Vectorize the pointer to match with how fp16/bfloat16 datatypes are + // processed in gemm operation. fp16 type packs 4 fp16 values while // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B // 2D indexes are computed with a single value in mind (e.g. float), - // to retain the same 2D indexes for half/bfloat16, we recast datatype - // from a single half to 4 packed half/2 packed bfloat16 respectively. + // to retain the same 2D indexes for fp16/bfloat16, we recast datatype + // from a single fp16 to 4 packed fp16/2 packed bfloat16 respectively. const typename vector_type::MemoryType* p_a_block_vec = reinterpret_cast::MemoryType*>( p_wei_block_double); diff --git a/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_fp16_bfp16.hpp b/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_fp16_bfp16.hpp index ffa9745ed3..8f1e84ea84 100644 --- a/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_fp16_bfp16.hpp +++ b/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_fp16_bfp16.hpp @@ -269,11 +269,11 @@ struct GridwiseGemmTransposedANormalBNormalCFp16Bfp16_v1 b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); // LDS double buffer: GEMM on current data - // Vectorize the pointer to match with how half/bfloat16 datatypes are - // processed in gemm operation. Half type packs 4 half values while + // Vectorize the pointer to match with how fp16/bfloat16 datatypes are + // processed in gemm operation. fp16 type packs 4 fp16 values while // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B // 2D indexes are computed with vectorized value in mind (e.g. float, half2, half4), - // we recast datatype from a single half to 4 packed half/2 packed bfloat16 + // we recast datatype from a single fp16 to 4 packed fp16/2 packed bfloat16 // respectively. const typename vector_type::MemoryType* p_a_block_vec = reinterpret_cast::MemoryType*>( diff --git a/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp b/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp index 19d010f9b7..e1d989302e 100644 --- a/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp +++ b/src/kernels/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp @@ -252,11 +252,11 @@ struct GridwiseGemmTransposedANormalBNormalCXdlopsFp16Bfp16_v1 b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); // LDS double buffer: GEMM on current data - // Vectorize the pointer to match with how half/bfloat16 datatypes are - // processed in gemm operation. Half type packs 4 half values while + // Vectorize the pointer to match with how fp16/bfloat16 datatypes are + // processed in gemm operation. fp16 type packs 4 fp16 values while // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B // 2D indexes are computed with vectorized value in mind (e.g. float, half2, half4), - // we recast datatype from a single half to 4 packed half/2 packed bfloat16 + // we recast datatype from a single fp16 to 4 packed fp16/2 packed bfloat16 // respectively. const typename vector_type::MemoryType* p_a_block_vec = reinterpret_cast::MemoryType*>( @@ -601,11 +601,11 @@ struct GridwiseBatchedGemmTransposedANormalBNormalCXdlopsFp16Bfp16_v1 b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); // LDS double buffer: GEMM on current data - // Vectorize the pointer to match with how half/bfloat16 datatypes are - // processed in gemm operation. Half type packs 4 half values while + // Vectorize the pointer to match with how fp16/bfloat16 datatypes are + // processed in gemm operation. fp16 type packs 4 fp16 values while // bfloat16 packs 2 bfloat16 values. Since gemm's matrix A and B // 2D indexes are computed with vectorized value in mind (e.g. float, half2, half4), - // we recast datatype from a single half to 4 packed half/2 packed bfloat16 + // we recast datatype from a single fp16 to 4 packed fp16/2 packed bfloat16 // respectively. const typename vector_type::MemoryType* p_a_block_vec = reinterpret_cast::MemoryType*>( diff --git a/src/kernels/composable_kernel/include/tensor_operation/xdlops_gemm.hpp b/src/kernels/composable_kernel/include/tensor_operation/xdlops_gemm.hpp index 99998617df..3be177cbcd 100644 --- a/src/kernels/composable_kernel/include/tensor_operation/xdlops_gemm.hpp +++ b/src/kernels/composable_kernel/include/tensor_operation/xdlops_gemm.hpp @@ -5,6 +5,8 @@ #include "ConstantMatrixDescriptor.hpp" #include "math.hpp" +#define WORKAROUND_SWDEV_229564 1 + namespace ck { enum struct mfma_instr @@ -205,7 +207,7 @@ struct mfma_info template __device__ void - run(Number, Number, const half* a, const half* b, float* reg_c) const + run(Number, Number, const half_t* a, const half_t* b, float* reg_c) const { static_assert((MPerWave == 64 && NPerWave == 64) || (MPerWave == 32 && NPerWave == 64) || (MPerWave == 64 && NPerWave == 32), @@ -237,7 +239,7 @@ struct mfma_info template __device__ void - run(Number, Number, const half* a, const half* b, float* reg_c) const + run(Number, Number, const half_t* a, const half_t* b, float* reg_c) const { static_assert((MPerWave == 32 && NPerWave == 32), "unsupported xdlops gemm"); @@ -267,7 +269,7 @@ struct mfma_info template __device__ void - run(Number, Number, const half* a, const half* b, float* reg_c) const + run(Number, Number, const half_t* a, const half_t* b, float* reg_c) const { static_assert((MPerWave == 16 && NPerWave == 16), "unsupported xdlops gemm"); @@ -297,7 +299,7 @@ struct mfma_info template __device__ void - run(Number, Number, const half* a, const half* b, float* reg_c) const + run(Number, Number, const half_t* a, const half_t* b, float* reg_c) const { static_assert((MPerWave == 16 && NPerWave == 64) || (MPerWave == 64 && NPerWave == 16), "unsupported xdlops gemm"); @@ -328,7 +330,7 @@ struct mfma_info template __device__ void - run(Number, Number, const half* a, const half* b, float* reg_c) const + run(Number, Number, const half_t* a, const half_t* b, float* reg_c) const { static_assert((MPerWave == 4 || MPerWave == 8) && NPerWave == 64, "unsupported xdlops gemm"); @@ -616,55 +618,55 @@ struct XdlopsGemm_t } template <> - __device__ static constexpr auto GetMFMAInfo() + __device__ static constexpr auto GetMFMAInfo() { return mfma_info{}; } template <> - __device__ static constexpr auto GetMFMAInfo() + __device__ static constexpr auto GetMFMAInfo() { return mfma_info{}; } template <> - __device__ static constexpr auto GetMFMAInfo() + __device__ static constexpr auto GetMFMAInfo() { return mfma_info{}; } template <> - __device__ static constexpr auto GetMFMAInfo() + __device__ static constexpr auto GetMFMAInfo() { return mfma_info{}; } template <> - __device__ static constexpr auto GetMFMAInfo() + __device__ static constexpr auto GetMFMAInfo() { return mfma_info{}; } template <> - __device__ static constexpr auto GetMFMAInfo() + __device__ static constexpr auto GetMFMAInfo() { return mfma_info{}; } template <> - __device__ static constexpr auto GetMFMAInfo() + __device__ static constexpr auto GetMFMAInfo() { return mfma_info{}; } template <> - __device__ static constexpr auto GetMFMAInfo() + __device__ static constexpr auto GetMFMAInfo() { return mfma_info{}; } template <> - __device__ static constexpr auto GetMFMAInfo() + __device__ static constexpr auto GetMFMAInfo() { return mfma_info{}; } @@ -851,6 +853,9 @@ struct XdlopsGemm_t auto pa = reinterpret_cast(&a); auto pb = reinterpret_cast(&b); +#if WORKAROUND_SWDEV_229564 +#pragma unroll +#endif for(index_t k = 0; k < K; ++k) { constexpr index_t nxdlops = sizeof(FloatA) / (mfma_type.k * sizeof(data_type)); @@ -883,6 +888,9 @@ struct XdlopsGemm_t constexpr index_t nxdlops = (sizeof(FloatA) * mfma_type.num_input_blks) / (mfma_type.k * sizeof(data_type)); +#if WORKAROUND_SWDEV_229564 +#pragma unroll +#endif for(index_t k = 0; k < K; k += mfma_type.num_input_blks) { for(index_t i = 0; i < nxdlops; ++i, pa += mfma_type.k, pb += mfma_type.k) diff --git a/src/kernels/composable_kernel/include/utility/float_type.hpp b/src/kernels/composable_kernel/include/utility/float_type.hpp index 94edeebc3c..fc4094fe12 100644 --- a/src/kernels/composable_kernel/include/utility/float_type.hpp +++ b/src/kernels/composable_kernel/include/utility/float_type.hpp @@ -11,6 +11,7 @@ typedef float float16_t __attribute__((ext_vector_type(16))); typedef float float32_t __attribute__((ext_vector_type(32))); // float16 +typedef _Float16 half_t; typedef _Float16 half2_t __attribute__((ext_vector_type(2))); typedef _Float16 half4_t __attribute__((ext_vector_type(4))); typedef _Float16 half8_t __attribute__((ext_vector_type(8))); @@ -85,37 +86,37 @@ struct vector_type }; template <> -struct vector_type +struct vector_type { - using MemoryType = half; + using MemoryType = half_t; template - __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number) { static_assert(I < 1, "wrong"); - *(reinterpret_cast(&v) + I) = s; + *(reinterpret_cast(&v) + I) = s; } }; template <> -struct vector_type +struct vector_type { using MemoryType = half2_t; union DataType { MemoryType vector; - half scalar[2]; + half_t scalar[2]; }; template - __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number) { static_assert(I < 2, "wrong"); - *(reinterpret_cast(&v) + I) = s; + *(reinterpret_cast(&v) + I) = s; } - __host__ __device__ static MemoryType Pack(half s0, half s1) + __host__ __device__ static MemoryType Pack(half_t s0, half_t s1) { DataType data; data.scalar[0] = s0; @@ -125,24 +126,24 @@ struct vector_type }; template <> -struct vector_type +struct vector_type { using MemoryType = half4_t; union DataType { MemoryType vector; - half scalar[4]; + half_t scalar[4]; }; template - __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number) { static_assert(I < 4, "wrong"); - *(reinterpret_cast(&v) + I) = s; + *(reinterpret_cast(&v) + I) = s; } - __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3) + __host__ __device__ static MemoryType Pack(half_t s0, half_t s1, half_t s2, half_t s3) { DataType data; data.scalar[0] = s0; @@ -154,21 +155,21 @@ struct vector_type }; template <> -struct vector_type +struct vector_type { using MemoryType = half8_t; union DataType { MemoryType vector; - half scalar[8]; + half_t scalar[8]; }; template - __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number) { static_assert(I < 8, "wrong"); - *(reinterpret_cast(&v) + I) = s; + *(reinterpret_cast(&v) + I) = s; } }; @@ -294,8 +295,8 @@ struct inner_product_with_conversion __device__ T operator()(half2_t a, half2_t b) const { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); + const half_t* p_a_half = reinterpret_cast(&a); + const half_t* p_b_half = reinterpret_cast(&b); T acc = 0; for(index_t v = 0; v < 2; ++v) @@ -308,8 +309,8 @@ struct inner_product_with_conversion __device__ T operator()(half4_t a, half4_t b) const { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); + const half_t* p_a_half = reinterpret_cast(&a); + const half_t* p_b_half = reinterpret_cast(&b); T acc = 0; for(index_t v = 0; v < 4; ++v) diff --git a/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp index 467f14baac..d2f2604fa8 100644 --- a/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp +++ b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp @@ -230,15 +230,15 @@ extern "C" __global__ gridwise_conv.Run(p_in_global, p_wei_global, p_out_global); #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION == 2 // Backward weight in fp16/bfp16 uses atomic add to do reduction along K dimension - // It requires output blob to be of float as no atomic add exists for half/ushort + // It requires output blob to be of float as no atomic add exists for fp16/ushort constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_wrw_gnchw_gkcyx_gnkhw_lds_double_buffer< GridSize, BlockSize, - FLOAT, // Input data type = half (fp16) or ushort (bfp16) + FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) FLOAT_ACCUM, // Acc data type = float (see float_types.h) - float, // Output data type = float (not half/ushort) as no atomic add ISA exists for - // half/ushort. + float, // Output data type = float (not fp16/ushort) as no atomic add ISA exists for + // fp16/ushort. decltype(in_gnchw_desc), decltype(wei_gkcyx_desc), decltype(out_gnkhw_desc), @@ -270,7 +270,7 @@ extern "C" __global__ GemmBBlockCopySrcDataPerRead_GemmN, GemmBBlockCopyDstDataPerWrite_GemmKPACK, dir>{}; - // Output blob is cast to float as no atomic add exists for half/ushort + // Output blob is cast to float as no atomic add exists for fp16/ushort gridwise_conv.Run(p_in_global, p_wei_global, reinterpret_cast(p_out_global)); #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION != 2 // Forward data doesn't use any atomic add so output blob remains of the same type @@ -279,9 +279,9 @@ extern "C" __global__ GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_fwd_gnchw_gkcyx_gnkhw_lds_double_buffer< GridSize, BlockSize, - FLOAT, // Input data type = half (fp16) or ushort (bfp16) + FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) FLOAT_ACCUM, // Acc data type = float (see float_types.h) - FLOAT, // Output data type = half (fp16) or ushort (bfp16) + FLOAT, // Output data type = fp16 (fp16) or ushort (bfp16) decltype(in_gnchw_desc), decltype(wei_gkcyx_desc), decltype(out_gnkhw_desc), diff --git a/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp index 194295647c..6c7ab7b233 100644 --- a/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp +++ b/src/kernels/composable_kernel/src/kernel_wrapper/gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp @@ -229,15 +229,15 @@ extern "C" __global__ #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION == 2 // Backward weight in fp16/bfp16 uses atomic add to do reduction along K dimension - // It requires output blob to be of float as no atomic add exists for half/ushort + // It requires output blob to be of float as no atomic add exists for fp16/ushort constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_wrw_nchw_kcyx_nkhw_lds_double_buffer< GridSize, BlockSize, - FLOAT, // Input data type = half (fp16) or ushort (bfp16) + FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) FLOAT_ACCUM, // Acc data type = float (see float_types.h) - float, // Output data type = float (not half/ushort) as no atomic add ISA exists for - // half/ushort. + float, // Output data type = float (not fp16/ushort) as no atomic add ISA exists for + // fp16/ushort. decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc), @@ -270,7 +270,7 @@ extern "C" __global__ GemmBBlockCopyDstDataPerWrite_GemmKPACK, dir>{}; - // Output blob is cast to float as no atomic add exists for half/ushort + // Output blob is cast to float as no atomic add exists for fp16/ushort gridwise_conv.Run(p_in_global, p_wei_global, reinterpret_cast(p_out_global)); #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION != 2 // Forward data doesn't use any atomic add so output blob remains of the same type @@ -287,9 +287,9 @@ extern "C" __global__ GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_fwd_nchw_kcyx_nkhw_lds_double_buffer< GridSize, BlockSize, - FLOAT, // Input data type = half (fp16) or ushort (bfp16) + FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) FLOAT_ACCUM, // Acc data type = float (see float_types.h) - FLOAT, // Input data type = half (fp16) or ushort (bfp16) + FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc), diff --git a/src/kernels/float_types.h b/src/kernels/float_types.h index 355a9409ba..1074e7ce2b 100644 --- a/src/kernels/float_types.h +++ b/src/kernels/float_types.h @@ -36,7 +36,7 @@ #if MIOPEN_USE_FP16 == 1 #ifdef __HIP_PLATFORM_HCC__ -#define FLOAT half +#define FLOAT _Float16 #define FLOAT_ACCUM float #else #pragma OPENCL EXTENSION cl_khr_fp16 : enable diff --git a/src/ocl/gcn_asm_utils.cpp b/src/ocl/gcn_asm_utils.cpp index d1b2b65df1..c1456f71e6 100644 --- a/src/ocl/gcn_asm_utils.cpp +++ b/src/ocl/gcn_asm_utils.cpp @@ -48,6 +48,12 @@ #include #endif // __linux__ +/// SWDEV-220166: hcc reports unknown target instead of amdgpu but reports "HCC" at least. +#define WORKAROUND_SWDEV_220166 1 +/// SWDEV-233338: hip-clang reports unknown target instead of amdgpu. +/// \todo Try to assemble AMD GCN source? +#define WORKAROUND_SWDEV_233338 1 + MIOPEN_DECLARE_ENV_VAR(MIOPEN_EXPERIMENTAL_GCN_ASM_PATH) static const char option_no_co_v3[] = "-mno-code-object-v3"; @@ -81,10 +87,13 @@ bool ValidateGcnAssemblerImpl() const auto path = GetGcnAssemblerPath(); if(path.empty()) { + MIOPEN_LOG_NQE("Path to assembler is not provided. Expect performance degradation."); return false; } if(!std::ifstream(path).good()) { + MIOPEN_LOG_NQE("Wrong path to assembler: '" << path + << "'. Expect performance degradation."); return false; } @@ -100,23 +109,28 @@ bool ValidateGcnAssemblerImpl() std::string clang_result_line; std::getline(clang_stdout, clang_result_line); MIOPEN_LOG_NQI2(clang_result_line); + +#if WORKAROUND_SWDEV_220166 if(clang_result_line.find("HCC") != std::string::npos) - // Temporary fix for SWDEV-220166 which causes clang to report unknown - // architecture for AMD GCN return true; - else if(clang_result_line.find("clang") != std::string::npos) +#endif + + if(clang_result_line.find("clang") != std::string::npos) { while(!clang_stdout.eof()) { std::getline(clang_stdout, clang_result_line); MIOPEN_LOG_NQI2(clang_result_line); - if(clang_result_line.find("Target: ") != std::string::npos) - { - return clang_result_line.find("amdgcn") != std::string::npos; - } + if(clang_result_line.find("Target: ") != std::string::npos && + clang_result_line.find("amdgcn") != std::string::npos) + return true; } +#if WORKAROUND_SWDEV_233338 + return true; +#endif } #endif // __linux__ + MIOPEN_LOG_NQE("Specified assembler does not support AMDGPU. Expect performance degradation."); return false; } diff --git a/src/ocl/utilocl.cpp b/src/ocl/utilocl.cpp index 21a5ac7643..27d7a76c13 100644 --- a/src/ocl/utilocl.cpp +++ b/src/ocl/utilocl.cpp @@ -62,17 +62,17 @@ float Im2d2ColGPU(Handle& handle, // clang-format off std::string network_config = - "c" + std::to_string(c) + - "i" + std::to_string(in_h) + - "_" + std::to_string(in_w) + - "w" + std::to_string(wei_h) + - "_" + std::to_string(wei_w) + - "p" + std::to_string(pad_h) + - "_" + std::to_string(pad_w) + - "s" + std::to_string(stride_h) + - "_" + std::to_string(stride_w) + - "d" + std::to_string(dilation_h) + - "_" + std::to_string(dilation_w) + + "c" + std::to_string(c) + + "i" + std::to_string(in_h) + + "_" + std::to_string(in_w) + + "w" + std::to_string(wei_h) + + "_" + std::to_string(wei_w) + + "p" + std::to_string(pad_h) + + "_" + std::to_string(pad_w) + + "s" + std::to_string(stride_h) + + "_" + std::to_string(stride_w) + + "d" + std::to_string(dilation_h) + + "_" + std::to_string(dilation_w) + "t" + std::to_string(type); // clang-format on @@ -249,22 +249,22 @@ float Im3d2ColGPU(Handle& handle, // clang-format off std::string network_config = - "c" + std::to_string(im_c) + - "i" + std::to_string(im_d) + - "_" + std::to_string(im_h) + - "_" + std::to_string(im_w) + - "w" + std::to_string(wei_d) + - "_" + std::to_string(wei_h) + - "_" + std::to_string(wei_w) + - "p" + std::to_string(pad_d) + - "_" + std::to_string(pad_h) + - "_" + std::to_string(pad_w) + - "s" + std::to_string(stride_d) + + "c" + std::to_string(im_c) + + "i" + std::to_string(im_d) + + "_" + std::to_string(im_h) + + "_" + std::to_string(im_w) + + "w" + std::to_string(wei_d) + + "_" + std::to_string(wei_h) + + "_" + std::to_string(wei_w) + + "p" + std::to_string(pad_d) + + "_" + std::to_string(pad_h) + + "_" + std::to_string(pad_w) + + "s" + std::to_string(stride_d) + "_" + std::to_string(stride_h) + - "_" + std::to_string(stride_w) + - "d" + std::to_string(dilation_d) + - "_" + std::to_string(dilation_h) + - "_" + std::to_string(dilation_w) + + "_" + std::to_string(stride_w) + + "d" + std::to_string(dilation_d) + + "_" + std::to_string(dilation_h) + + "_" + std::to_string(dilation_w) + "t" + std::to_string(type); // clang-format on @@ -364,17 +364,17 @@ float Col2Im2dGPU(Handle& handle, // clang-format off std::string network_config = - "c" + std::to_string(in_c) + + "c" + std::to_string(in_c) + "in_h" + std::to_string(in_h) + - "in_w" + std::to_string(in_w) + - "y" + std::to_string(wei_h) + - "x" + std::to_string(wei_w) + - "p" + std::to_string(pad_h) + - "q" + std::to_string(pad_w) + - "u" + std::to_string(stride_h) + - "v" + std::to_string(stride_w) + - "l" + std::to_string(dilation_h) + - "j" + std::to_string(dilation_w) + + "in_w" + std::to_string(in_w) + + "y" + std::to_string(wei_h) + + "x" + std::to_string(wei_w) + + "p" + std::to_string(pad_h) + + "q" + std::to_string(pad_w) + + "u" + std::to_string(stride_h) + + "v" + std::to_string(stride_w) + + "l" + std::to_string(dilation_h) + + "j" + std::to_string(dilation_w) + "t" + std::to_string(type); // clang-format on @@ -458,22 +458,22 @@ float Col2Im3dGPU(Handle& handle, // clang-format off std::string network_config = - "c" + std::to_string(in_c) + - "i" + std::to_string(in_d) + - "_" + std::to_string(in_h) + - "_" + std::to_string(in_w) + - "w" + std::to_string(wei_d) + - "_" + std::to_string(wei_h) + - "_" + std::to_string(wei_w) + - "p" + std::to_string(pad_d) + - "_" + std::to_string(pad_h) + - "_" + std::to_string(pad_w) + - "s" + std::to_string(stride_d) + + "c" + std::to_string(in_c) + + "i" + std::to_string(in_d) + + "_" + std::to_string(in_h) + + "_" + std::to_string(in_w) + + "w" + std::to_string(wei_d) + + "_" + std::to_string(wei_h) + + "_" + std::to_string(wei_w) + + "p" + std::to_string(pad_d) + + "_" + std::to_string(pad_h) + + "_" + std::to_string(pad_w) + + "s" + std::to_string(stride_d) + "_" + std::to_string(stride_h) + - "_" + std::to_string(stride_w) + - "d" + std::to_string(dilation_d) + - "_" + std::to_string(dilation_h) + - "_" + std::to_string(dilation_w) + + "_" + std::to_string(stride_w) + + "d" + std::to_string(dilation_d) + + "_" + std::to_string(dilation_h) + + "_" + std::to_string(dilation_w) + "t" + std::to_string(type); // clang-format on @@ -960,7 +960,7 @@ float transpose_NCHW2Vec(Handle& handle, lens.begin() + 2, lens.end(), std::size_t(1), std::multiplies()); // clang-format off - std::string network_config = + std::string network_config = "n" + std::to_string(n) + "c" + std::to_string(c) + "hw" + std::to_string(hw) + diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp index 6078870e4a..523cbf58f1 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp @@ -23,12 +23,15 @@ * SOFTWARE. * *******************************************************************************/ -#include -#include "miopen/solver.hpp" -#include "miopen/handle.hpp" +#include +#include +#include #include + #include "implicitgemm_util.hpp" +#include + namespace miopen { namespace solver { @@ -882,6 +885,7 @@ ConvSolution ConvHipImplicitGemmBwdDataV1R1::GetSolution( std::to_string(GemmBBlockCopyDstDataPerWrite_GemmKPACK); } + result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx); result.construction_params.push_back(construction_parameters); return result; } diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp index 37ca8c685f..4f6e8dabb8 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp @@ -23,12 +23,15 @@ * SOFTWARE. * *******************************************************************************/ -#include -#include "miopen/solver.hpp" -#include "miopen/handle.hpp" +#include +#include +#include #include + #include "implicitgemm_util.hpp" +#include + namespace miopen { namespace solver { @@ -283,6 +286,7 @@ ConvSolution ConvHipImplicitGemmBwdDataV1R1Xdlops::GetSolution( std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_KPACK=") + std::to_string(GemmBBlockCopyDstDataPerWrite_GemmKPACK); } + result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx); result.construction_params.push_back(construction_parameters); return result; } diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp index f6651060e9..48066d0b23 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp @@ -23,13 +23,16 @@ * SOFTWARE. * *******************************************************************************/ -#include -#include -#include "miopen/solver.hpp" -#include "miopen/handle.hpp" +#include +#include +#include #include + #include "implicitgemm_util.hpp" +#include +#include + namespace miopen { namespace solver { @@ -446,9 +449,9 @@ bool PerformanceImplicitGemmBwdDataV4R1::IsValidValue() const { // clang-format off return IsTwoPower<64, 256>(BlockSize) && - IsTwoPower<32, 128>(GemmMPerBlock) && + IsTwoPower<32, 128>(GemmMPerBlock) && IsTwoPower<32, 128>(GemmNPerBlock) && - IsTwoPower<4, 16>(GemmKPerBlock) && + IsTwoPower<4, 16>(GemmKPerBlock) && IsTwoPower<2, 4>(GemmMPerThread) && IsTwoPower<2, 4>(GemmNPerThread); // clang-format on @@ -956,6 +959,7 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1::GetSolution( } } + result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx); return result; } diff --git a/src/solver/conv_hip_implicit_gemm_v4.cpp b/src/solver/conv_hip_implicit_gemm_v4.cpp index 0b4b6388a6..d33aa7a343 100644 --- a/src/solver/conv_hip_implicit_gemm_v4.cpp +++ b/src/solver/conv_hip_implicit_gemm_v4.cpp @@ -24,13 +24,15 @@ * *******************************************************************************/ -#include "miopen/solver.hpp" -#include "miopen/handle.hpp" +#include + +#include +#include #include -#include "miopen/stringutils.hpp" -#include "implicitgemm_util.hpp" -#include "miopen/implicitgemm_params.hpp" -#include "miopen/hip_build_utils.hpp" +#include +#include +#include + #include "implicitgemm_util.hpp" #define WORKAROUND_ISSUE_2174_2222_2224_2243 1 @@ -357,6 +359,9 @@ static inline ConvSolution GetSolutionBase(const ConvolutionContext& ctx, ctx.general_compile_options; // clang-format on + if(ctx.direction.IsForward() || ctx.direction.IsBackwardData()) + result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx); + result.construction_params.push_back(construction_parameters); return result; } diff --git a/src/solver/conv_hip_implicit_gemm_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_v4r1.cpp index 166fb0e9b3..7aa377af0f 100644 --- a/src/solver/conv_hip_implicit_gemm_v4r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_v4r1.cpp @@ -23,12 +23,17 @@ * SOFTWARE. * *******************************************************************************/ -#include -#include "miopen/solver.hpp" -#include "miopen/handle.hpp" + +#include + +#include +#include #include + #include "implicitgemm_util.hpp" +#include + namespace miopen { namespace solver { @@ -293,7 +298,7 @@ ConvSolution ConvHipImplicitGemmV4R1Fwd::GetSolution(const ConvolutionContext& c const auto InBlockCopyDstDataPerWrite_EPack = !ctx.IsFp32() ? GetEPackLength(ctx, false) : 1; // clang-format off - construction_parameters.comp_options = + construction_parameters.comp_options = std::string(" -std=c++14 ") + std::string(" -DCK_PARAM_PROBLEM_N=") + std::to_string(n) + std::string(" -DCK_PARAM_PROBLEM_K=") + std::to_string(k) + @@ -359,6 +364,7 @@ ConvSolution ConvHipImplicitGemmV4R1Fwd::GetSolution(const ConvolutionContext& c std::to_string(WeiBlockCopyDstDataPerWrite_EPack); } + result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx); result.construction_params.push_back(construction_parameters); return result; } @@ -492,7 +498,7 @@ ConvSolution ConvHipImplicitGemmV4R1WrW::GetSolution(const ConvolutionContext& c InBlockCopySrcDataPerRead_B = ctx.kernel_stride_w > 1 ? 1 : InBlockCopySrcDataPerRead_B; // clang-format off - construction_parameters.comp_options = + construction_parameters.comp_options = std::string(" -std=c++14 ") + std::string(" -DCK_PARAM_PROBLEM_N=") + std::to_string(n) + std::string(" -DCK_PARAM_PROBLEM_K=") + std::to_string(k) + @@ -535,7 +541,7 @@ ConvSolution ConvHipImplicitGemmV4R1WrW::GetSolution(const ConvolutionContext& c std::string(" -DCK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E=") + std::to_string(config.WeiBlockCopyClusterLengths_E) + std::string(" -DCK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K=") + std::to_string(config.WeiBlockCopyClusterLengths_K) + std::string(" -DCK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E=") + std::to_string(WeiBlockCopySrcDataPerRead_E) + - std::string(" -DCK_PARAM_EPACK_LENGTH=") + std::to_string(GetEPackLength(ctx, false)) + + std::string(" -DCK_PARAM_EPACK_LENGTH=") + std::to_string(GetEPackLength(ctx, false)) + std::string(" -DCK_THREADWISE_GEMM_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx)? '1' : '0') + std::string(" -DCK_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') + ctx.general_compile_options; diff --git a/src/solver/conv_hip_implicit_gemm_v4r4.cpp b/src/solver/conv_hip_implicit_gemm_v4r4.cpp index 9b0c4295e1..af7f0f8261 100644 --- a/src/solver/conv_hip_implicit_gemm_v4r4.cpp +++ b/src/solver/conv_hip_implicit_gemm_v4r4.cpp @@ -23,12 +23,15 @@ * SOFTWARE. * *******************************************************************************/ -#include -#include "miopen/solver.hpp" -#include "miopen/handle.hpp" +#include +#include +#include #include + #include "implicitgemm_util.hpp" +#include + namespace miopen { namespace solver { @@ -769,6 +772,7 @@ ConvSolution ConvHipImplicitGemmV4R4Fwd::GetSolution(const ConvolutionContext& c // clang-format on + result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx); result.construction_params.push_back(construction_parameters); return result; } diff --git a/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops.cpp index 090419b0cf..e8418a60c8 100644 --- a/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops.cpp @@ -24,12 +24,15 @@ * *******************************************************************************/ -#include "miopen/solver.hpp" -#include "miopen/handle.hpp" +#include + +#include +#include #include -#include "miopen/stringutils.hpp" +#include +#include + #include "implicitgemm_util.hpp" -#include "miopen/implicitgemm_params.hpp" namespace miopen { namespace solver { @@ -320,6 +323,9 @@ static inline ConvSolution GetSolutionBase(const ConvolutionContext& ctx, } } + if(ctx.direction.IsForward() || ctx.direction.IsBackwardData()) + result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx); + result.construction_params.push_back(construction_parameters); return result; } diff --git a/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp b/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp index 93a761b2f7..d1d5d21e52 100644 --- a/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp +++ b/src/solver/conv_hip_implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp @@ -24,13 +24,15 @@ * *******************************************************************************/ -#include "miopen/solver.hpp" -#include "miopen/handle.hpp" +#include +#include #include -#include "miopen/stringutils.hpp" +#include +#include +#include +#include + #include "implicitgemm_util.hpp" -#include "miopen/implicitgemm_params.hpp" -#include namespace miopen { namespace solver { @@ -531,6 +533,7 @@ ConvSolution ConvHipImplicitGemmV4R4GenXdlopsFwdFp32::GetSolution( ctx.general_compile_options; // clang-format on + result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx); result.construction_params.push_back(construction_parameters); return result; } diff --git a/src/solver/conv_hip_implicit_gemm_v4r4_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_v4r4_xdlops.cpp index 20ee060316..a47d4bef06 100644 --- a/src/solver/conv_hip_implicit_gemm_v4r4_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_v4r4_xdlops.cpp @@ -24,14 +24,17 @@ * *******************************************************************************/ -#include "miopen/solver.hpp" -#include "miopen/handle.hpp" +#include + +#include +#include #include -#include "miopen/stringutils.hpp" -#include "implicitgemm_util.hpp" -#include "miopen/implicitgemm_params.hpp" +#include +#include #include +#include "implicitgemm_util.hpp" + namespace miopen { namespace solver { @@ -174,7 +177,7 @@ static inline ConvSolution GetSolutionBase(const ConvolutionContext& ctx, } // clang-format off - construction_parameters.comp_options += + construction_parameters.comp_options += std::string(" -std=c++14 ") + std::string(" -DCK_PARAM_PROBLEM_DIRECTION=") + std::to_string(static_cast(direction)) + std::string(" -DCK_PARAM_PROBLEM_N=") + std::to_string(ctx.batch_sz) + @@ -195,17 +198,20 @@ static inline ConvSolution GetSolutionBase(const ConvolutionContext& ctx, std::string(" -DCK_PARAM_IN_BLOCK_COPY_CLUSTER_LENGTHS_B=") + std::to_string(config.InBlockCopyClusterLengths_B) + std::string(" -DCK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_E=") + std::to_string(config.WeiBlockCopyClusterLengths_E) + std::string(" -DCK_PARAM_WEI_BLOCK_COPY_CLUSTER_LENGTHS_K=") + std::to_string(config.WeiBlockCopyClusterLengths_K) + - std::string(" -DCK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B=") + std::to_string(InBlockCopyDataPerAccess_B) + - std::string(" -DCK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E=") + std::to_string(WeiBlockCopySrcDataPerRead_E) + - std::string(" -DCK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K=") + std::to_string(WeiBlockCopyDstDataPerWrite_K) + - std::string(" -DCK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B=") + std::to_string(OutThreadCopyDataPerAccess_B) + - std::string(" -DCK_PARAM_EPACK_LENGTH=") + std::to_string(GetEPackLength(ctx, true)) + + std::string(" -DCK_PARAM_IN_BLOCK_COPY_DATA_PER_ACCESS_B=") + std::to_string(InBlockCopyDataPerAccess_B) + + std::string(" -DCK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E=") + std::to_string(WeiBlockCopySrcDataPerRead_E) + + std::string(" -DCK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K=") + std::to_string(WeiBlockCopyDstDataPerWrite_K) + + std::string(" -DCK_PARAM_OUT_THREAD_COPY_DATA_PER_ACCESS_B=") + std::to_string(OutThreadCopyDataPerAccess_B) + + std::string(" -DCK_PARAM_EPACK_LENGTH=") + std::to_string(GetEPackLength(ctx, true)) + std::string(" -DCK_USE_AMD_XDLOPS=") + (IsXdlopsSupport(ctx) ? '1' : '0') + std::string(" -DCK_USE_AMD_XDLOPS_INLINE_ASM=") + (miopen::IsEnabled(MIOPEN_DEBUG_IMPLICIT_GEMM_XDLOPS_INLINE_ASM{}) ? '1' : '0') + std::string(" -DCK_USE_AMD_XDLOPS_EMULATE=") + (miopen::IsEnabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_XDLOPS_EMULATE{}) ? '1' : '0') + ctx.general_compile_options; // clang-format on + if(ctx.direction.IsForward() || ctx.direction.IsBackwardData()) + result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx); + result.construction_params.push_back(construction_parameters); return result; } diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp index d71c485de7..a05e9b6396 100644 --- a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp +++ b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp @@ -472,6 +472,8 @@ bool ConvOclBwdWrW2::IsApplicableBase(const ConvolutionContext& p return false; if(!params.Is2d()) return false; + if(params.IsAsymmetricPadH() || params.IsAsymmetricPadW()) + return false; if(!(params.IsFp32() || params.IsFp16() || params.IsBfp16())) return false; diff --git a/src/sqlite_db.cpp b/src/sqlite_db.cpp index 5ad9110f2e..ed2b5c42ee 100644 --- a/src/sqlite_db.cpp +++ b/src/sqlite_db.cpp @@ -50,19 +50,228 @@ namespace miopen { +class SQLite::impl +{ + struct SQLiteCloser + { + void operator()(sqlite3* ptr) + { + std::string filename_(sqlite3_db_filename(ptr, "main")); + SQLite::Retry([&]() { return sqlite3_close(ptr); }, filename_); + } + }; + + public: + impl(const std::string& filename_, bool is_system) + { + sqlite3* ptr_tmp; + int rc = 0; + if(is_system) + rc = sqlite3_open_v2(filename_.c_str(), &ptr_tmp, SQLITE_OPEN_READONLY, nullptr); + else + rc = sqlite3_open_v2( + filename_.c_str(), &ptr_tmp, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, nullptr); + ptrDb = sqlite3_ptr{ptr_tmp}; + isValid = (rc == 0); + } + + using sqlite3_ptr = std::unique_ptr; + sqlite3_ptr ptrDb = nullptr; + bool isValid; +}; + +static int find_callback(void* _res, int argc, char** argv, char** azColName) +{ + SQLite::result_type* res = static_cast(_res); + std::unordered_map record; + for(auto i = 0; i < argc; i++) + record[azColName[i]] = (argv[i] != nullptr) ? argv[i] : "NULL"; + if(res != nullptr) + res->push_back(record); + return 0; +} + +SQLite::SQLite() : pImpl(nullptr) {} +SQLite::~SQLite() = default; +SQLite::SQLite(SQLite&&) noexcept = default; +SQLite& SQLite::operator=(SQLite&&) noexcept = default; +SQLite::result_type SQLite::Exec(const std::string& query) const +{ + SQLite::result_type res; + MIOPEN_LOG_T(std::this_thread::get_id() << ":" << query); + { + auto rc = Retry([&]() { + return sqlite3_exec(pImpl->ptrDb.get(), + query.c_str(), + find_callback, + static_cast(&res), + nullptr); + }); + if(rc != SQLITE_OK) + { + MIOPEN_LOG_I2(query); + MIOPEN_THROW(miopenStatusInternalError, ErrorMessage()); + } + } + return res; +} + +int SQLite::Retry(std::function f, std::string filename) +{ + auto timeout_end = std::chrono::high_resolution_clock::now() + + std::chrono::seconds(30); // TODO: make configurable + auto tries = 0; + while(true) + { + int rc = f(); + if(rc == SQLITE_BUSY) + { + MIOPEN_LOG_I2("Database" + filename + " busy, retrying ..."); + ++tries; + if(tries > 50) + std::this_thread::sleep_for(std::chrono::microseconds(100)); + else + std::this_thread::yield(); + } + else + return rc; + if(std::chrono::high_resolution_clock::now() > timeout_end) + MIOPEN_THROW("Timeout while waiting for Database: " + filename); + } +} + +int SQLite::Retry(std::function f) const +{ + std::string filename(sqlite3_db_filename(pImpl->ptrDb.get(), "main")); + return SQLite::Retry(f, filename); +} + +int SQLite::Changes() const { return sqlite3_changes(pImpl->ptrDb.get()); } + +std::string SQLite::ErrorMessage() const +{ + std::string errMsg = "Internal error while accessing SQLite database: "; + return errMsg + sqlite3_errmsg(pImpl->ptrDb.get()); +} +bool SQLite::Valid() const { return pImpl->isValid; } + +class SQLite::Statement::impl +{ + using sqlite3_stmt_ptr = MIOPEN_MANAGE_PTR(sqlite3_stmt*, sqlite3_finalize); + sqlite3_stmt_ptr Prepare(const SQLite& sql, const std::string& query) + { + sqlite3_stmt* ptr = nullptr; + MIOPEN_LOG_I2(query); + auto rc = + sqlite3_prepare_v2(sql.pImpl->ptrDb.get(), query.c_str(), query.size(), &ptr, nullptr); + if(rc != SQLITE_OK) + { + std::string err_msg = "SQLite prepare error: "; + MIOPEN_THROW(miopenStatusInternalError, err_msg + sql.ErrorMessage()); + } + return sqlite3_stmt_ptr{ptr}; + } + + public: + impl(const SQLite& sql, const std::string& query) { ptrStmt = Prepare(sql, query); } + impl(const SQLite& sql, const std::string& query, const std::vector& vals) + { + ptrStmt = Prepare(sql, query); + int cnt = 1; + for(auto& kinder : vals) + { + auto rc = sqlite3_bind_text( + ptrStmt.get(), cnt++, kinder.data(), kinder.size(), SQLITE_TRANSIENT); // NOLINT + if(rc != SQLITE_OK) + MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage()); + } + MIOPEN_LOG_I2("[" << JoinStrings(vals, ",") << "]"); + } + + sqlite3_stmt_ptr ptrStmt = nullptr; +}; + +SQLite::SQLite(const std::string& filename_, bool is_system) + : pImpl{std::make_unique(filename_, is_system)} +{ +} + +SQLite::Statement::Statement(const SQLite& sql, const std::string& query) + : pImpl{std::make_unique(sql, query)} +{ +} +SQLite::Statement::Statement(const SQLite& sql, + const std::string& query, + const std::vector& vals) + : pImpl{std::make_unique(sql, query, vals)} +{ +} +SQLite::Statement::~Statement() = default; +SQLite::Statement::Statement() : pImpl{nullptr} {} +SQLite::Statement::Statement(Statement&&) noexcept = default; +SQLite::Statement& SQLite::Statement::operator=(Statement&&) noexcept = default; +int SQLite::Statement::Step(const SQLite& sql) +{ + return sql.Retry([&]() { return sqlite3_step(pImpl->ptrStmt.get()); }); +} +std::string SQLite::Statement::ColumnText(int idx) +{ + size_t bytes = sqlite3_column_bytes(pImpl->ptrStmt.get(), idx); + return std::string{ + reinterpret_cast(sqlite3_column_text(pImpl->ptrStmt.get(), idx)), bytes}; +} + +std::string SQLite::Statement::ColumnBlob(int idx) +{ + auto ptr = sqlite3_column_blob(pImpl->ptrStmt.get(), idx); + auto sz = sqlite3_column_bytes(pImpl->ptrStmt.get(), idx); + return std::string{reinterpret_cast(ptr), static_cast(sz)}; +} +int64_t SQLite::Statement::ColumnInt64(int idx) +{ + return sqlite3_column_int64(pImpl->ptrStmt.get(), idx); +} + +int SQLite::Statement::BindText(int idx, const std::string& txt) +{ + sqlite3_bind_text( + pImpl->ptrStmt.get(), idx, txt.data(), txt.size(), SQLITE_TRANSIENT); // NOLINT + return 0; +} +int SQLite::Statement::BindBlob(int idx, const std::string& blob) +{ + sqlite3_bind_blob( + pImpl->ptrStmt.get(), idx, blob.data(), blob.size(), SQLITE_TRANSIENT); // NOLINT + return 0; +} + +int SQLite::Statement::BindInt64(int idx, const int64_t num) +{ + sqlite3_bind_int64(pImpl->ptrStmt.get(), idx, num); + return 0; +} + SQLitePerfDb::SQLitePerfDb(const std::string& filename_, bool is_system, const std::string& arch_, const std::size_t num_cu_) : SQLiteBase(filename_, is_system, arch_, num_cu_) { + if(dbInvalid) + { + if(filename.empty()) + MIOPEN_LOG_I("database not present"); + else + MIOPEN_LOG_I(filename + " database invalid"); + return; + } ProblemDescription prob_desc{conv::Direction::Forward}; prob_desc.in_data_type = miopenFloat; prob_desc.out_data_type = miopenFloat; prob_desc.weights_data_type = miopenFloat; if(!is_system) { - SQLRes_t res; + SQLite::result_type res; const std::string create_config = prob_desc.CreateQuery(); // clang-format off const std::string create_perfdb_sql = @@ -89,14 +298,13 @@ SQLitePerfDb::SQLitePerfDb(const std::string& filename_, "type = 'table' AND " "(name = 'config' OR name = 'perf_db');"; // clang-format on - SQLExec(check_tables, res); + res = sql.Exec(check_tables); } if(res.empty()) { const auto lock = exclusive_lock(lock_file, GetLockTimeout()); MIOPEN_VALIDATE_LOCK(lock); - if(!SQLExec(create_config + create_perfdb_sql)) - MIOPEN_THROW(miopenStatusInternalError); + sql.Exec(create_config + create_perfdb_sql); MIOPEN_LOG_I2("Database created successfully"); } } @@ -117,7 +325,5 @@ SQLitePerfDb::SQLitePerfDb(const std::string& filename_, dbInvalid = true; } } - else - MIOPEN_LOG_I(filename + " database invalid"); } } // namespace miopen diff --git a/test/find_db.cpp b/test/find_db.cpp index ef216b21e0..b40ccb776a 100644 --- a/test/find_db.cpp +++ b/test/find_db.cpp @@ -203,5 +203,6 @@ struct FindDbTest : test_driver int main(int argc, const char* argv[]) { setenv("MIOPEN_LOG_LEVEL", "6", 1); + setenv("MIOPEN_COMPILE_PARALLEL_LEVEL", "1", 1); test_drive(argc, argv); } diff --git a/test/sqlite_perfdb.cpp b/test/sqlite_perfdb.cpp index 049629d716..7ce588c948 100644 --- a/test/sqlite_perfdb.cpp +++ b/test/sqlite_perfdb.cpp @@ -225,12 +225,16 @@ std::ostream& operator<<(std::ostream& s, const SolverData& td) class DbTest { public: - DbTest() : temp_file("miopen.tests.perfdb") {} + DbTest() + : temp_file("miopen.tests.perfdb"), db_inst{std::string(temp_file), false, "gfx906", 64} + { + } virtual ~DbTest() {} protected: TempFile temp_file; + SQLitePerfDb db_inst; static const std::array, 2>& common_data() { @@ -243,11 +247,10 @@ class DbTest void ClearDb(SQLitePerfDb& db) const { - auto res = db.SQLExec("delete from config; delete from perf_db;"); - EXPECT(res); + db.sql.Exec("delete from config; delete from perf_db;"); } - void ResetDb() const {} + void ResetDb() const { db_inst.sql.Exec("delete from config; delete from perf_db;"); } static const ProblemData& key() { @@ -328,35 +331,25 @@ class SchemaTest : public DbTest public: void Run() const { - SQLitePerfDb db_inst(std::string(temp_file), false, "gfx906", 64); - // check if the config and perf_db tables exist - SQLitePerfDb::SQLRes_t res; - if(db_inst.SQLExec( - // clang-format off - "SELECT name, sql " - "FROM sqlite_master " - "WHERE type='table' " - "AND name = 'config';" - // clang-format on - , - res)) - EXPECT(res.size() == 1); - else - EXPECT(false); - if(db_inst.SQLExec( - // clang-format off - "SELECT name, sql " - "FROM sqlite_master " - "WHERE type='table' " - "AND name = 'perf_db';" - // clang-format on - , - res)) - - EXPECT(res.size() == 1); - else - EXPECT(false); + SQLite::result_type res = db_inst.sql.Exec( + // clang-format off + "SELECT name, sql " + "FROM sqlite_master " + "WHERE type='table' " + "AND name = 'config';" + // clang-format on + ); + EXPECT(res.size() == 1); + res = db_inst.sql.Exec( + // clang-format off + "SELECT name, sql " + "FROM sqlite_master " + "WHERE type='table' " + "AND name = 'perf_db';" + // clang-format on + ); + EXPECT(res.size() == 1); // TODO: check for indices } }; @@ -364,10 +357,9 @@ class SchemaTest : public DbTest class DbFindTest : public DbTest { public: - void Run() const + void Run() { - SQLitePerfDb db_inst(std::string(temp_file), false, "gfx906", 64); - ResetDb(); // redundant + ResetDb(); const ProblemData p; db_inst.InsertConfig(p); @@ -379,11 +371,11 @@ class DbFindTest : public DbTest const SolverData sol; std::ostringstream ss; sol.Serialize(ss); - EXPECT(db_inst.SQLExec( + db_inst.sql.Exec( // clang-formagt off "INSERT INTO perf_db(config, solver, params, arch, num_cu) " "VALUES( " + - id + ", '" + id0() + "', '" + ss.str() + "', 'gfx906', 64);")); + id + ", '" + id0() + "', '" + ss.str() + "', 'gfx906', 64);"); // clang-fromat on auto sol_res = db_inst.FindRecord(p); From d20b9c8f57ca75bd75e7b9c6700556a21ac6ec0d Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 6 May 2020 10:23:57 +0800 Subject: [PATCH 06/18] add invoker for v4r1 xdlops bwd igemm path --- src/include/miopen/sqlite_db.hpp | 3 --- src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp index 14c861693d..8d3c61bf4d 100755 --- a/src/include/miopen/sqlite_db.hpp +++ b/src/include/miopen/sqlite_db.hpp @@ -400,9 +400,6 @@ class SQLiteBase return stmt; } ->>>>>>> 5609daf1df27ee7a7d4a5c90c28ef1dc979ffdfc -======= ->>>>>>> develop template inline auto FindRecord(U&... args) { diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp index dcdbef7e89..ae79d911f5 100755 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -23,6 +23,7 @@ * SOFTWARE. * *******************************************************************************/ +#include #include #include "miopen/solver.hpp" #include "miopen/handle.hpp" @@ -681,6 +682,7 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution( } } + result.invoker_factory = conv::MakeImplGemmDataInvokerFactory(ctx); return result; } From 0bec9d48efe73a136a923388cea5df86b5b423bb Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 6 May 2020 10:27:11 +0800 Subject: [PATCH 07/18] merge sqlite hpp 1 --- src/include/miopen/sqlite_db.hpp | 700 ------------------------------- 1 file changed, 700 deletions(-) diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp index 8d3c61bf4d..e69de29bb2 100755 --- a/src/include/miopen/sqlite_db.hpp +++ b/src/include/miopen/sqlite_db.hpp @@ -1,700 +0,0 @@ -/******************************************************************************* -* -* MIT License -* -* Copyright (c) 2019 Advanced Micro Devices, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in all -* copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -* -*******************************************************************************/ -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include "sqlite3.h" -#include -#include - -#include -#include -#include - -namespace boost { -namespace filesystem { -class path; -} // namespace filesystem -} // namespace boost - -namespace miopen { - -#define MIOPEN_VALIDATE_LOCK(lock) \ - do \ - { \ - if(!(lock)) \ - MIOPEN_THROW("Db lock has failed to lock."); \ - } while(false) - -template -struct SQLiteSerializable -{ - std::vector FieldNames() const - { - std::vector names; - Derived::Visit(static_cast(*this), - [&](const std::string& value, const std::string& name) { - std::ignore = value; - names.push_back(name); - }); - Derived::Visit(static_cast(*this), - [&](const int value, const std::string name) { - std::ignore = value; - names.push_back(name); - }); - - return names; - } - std::tuple> WhereClause() const - { - std::vector values; - std::vector clauses; - Derived::Visit(static_cast(*this), - [&](const std::string& value, const std::string& name) { - clauses.push_back("(" + name + " = ? )"); - values.push_back(value); - }); - Derived::Visit(static_cast(*this), - [&](const int value, const std::string name) { - clauses.push_back("(" + name + " = ? )"); - values.push_back(std::to_string(value)); - }); - std::string clause = JoinStrings(clauses, " AND "); - return std::make_tuple(clause, values); - } - std::tuple> InsertQuery() const - { - std::vector int_names, str_names, values; - Derived::Visit(static_cast(*this), - [&](const std::string& value, const std::string& name) { - str_names.push_back(name); - values.push_back(value); - }); - Derived::Visit(static_cast(*this), - [&](const int value, const std::string name) { - int_names.push_back(name); - values.push_back(std::to_string(value)); - }); - std::vector tokens((values.size()), "?"); - ; - - std::string q = "INSERT OR IGNORE INTO " + Derived::table_name() + "( " + - JoinStrings(str_names, ",") + "," + JoinStrings(int_names, ",") + - " ) VALUES( " + JoinStrings(tokens, ",") + ");"; - return std::make_tuple(q, values); - } - std::tuple> SelectQuery() const - { - std::string clauses; - std::vector values; - std::tie(clauses, values) = WhereClause(); - std::string query = "SELECT id FROM " + Derived::table_name() + " WHERE " + clauses + ";"; - return std::make_tuple(query, values); - } - - std::string CreateQuery() const - { - std::vector str_fields; - Derived::Visit(static_cast(*this), - [&](const std::string value, const std::string name) { - std::ignore = value; - str_fields.push_back(name); - }); - std::vector int_fields; - Derived::Visit(static_cast(*this), - [&](const int value, const std::string name) { - std::ignore = value; - int_fields.push_back(name); - }); - std::ostringstream ss; - ss << "CREATE TABLE IF NOT EXISTS `" << Derived::table_name() << "` (" - << "`id` INTEGER PRIMARY KEY ASC"; - for(auto& el : str_fields) - ss << ",`" << el << "` TEXT NOT NULL"; - for(auto& el : int_fields) - ss << ",`" << el << "` INT NOT NULL"; - ss << ");"; - ss << "CREATE UNIQUE INDEX IF NOT EXISTS " - << "`idx_" << Derived::table_name() << "` " - << "ON " << Derived::table_name() << "( " << miopen::JoinStrings(str_fields, ",") << ", " - << miopen::JoinStrings(int_fields, ",") << " );"; - return ss.str(); - } -}; - -class SQLite -{ - class impl; - // do we need propagate const - std::unique_ptr pImpl; - - public: - class Statement - { - class impl; - std::unique_ptr pImpl; - - public: - Statement(const SQLite& sql, const std::string& query); - Statement(const SQLite& sql, - const std::string& query, - const std::vector& vals); - Statement(); - ~Statement(); - Statement(Statement&&) noexcept; - Statement& operator=(Statement&&) noexcept; - Statement& operator=(const Statement&) = delete; - int Step(const SQLite& sql); - std::string ColumnText(int idx); - std::string ColumnBlob(int idx); - int64_t ColumnInt64(int idx); - int BindText(int idx, const std::string& txt); - int BindBlob(int idx, const std::string& blob); - int BindInt64(int idx, int64_t); - }; - - using result_type = std::vector>; - SQLite(); - SQLite(const std::string& filename_, bool is_system); - ~SQLite(); - SQLite(SQLite&&) noexcept; - SQLite& operator=(SQLite&&) noexcept; - SQLite& operator=(const SQLite&) = delete; - bool Valid() const; - result_type Exec(const std::string& query) const; - int Changes() const; - int Retry(std::function) const; - static int Retry(std::function f, std::string filename); - std::string ErrorMessage() const; -}; - -template -class SQLiteBase -{ - protected: - using exclusive_lock = boost::unique_lock; - using shared_lock = boost::shared_lock; - static boost::system_time GetLockTimeout() - { - return boost::get_system_time() + boost::posix_time::milliseconds(60000); - } - - public: - SQLiteBase(const std::string& filename_, - bool is_system, - const std::string& arch_, - std::size_t num_cu_) - : filename(filename_), - arch(arch_), - num_cu(num_cu_), - lock_file(LockFile::Get(LockFilePath(filename_).c_str())) - { - MIOPEN_LOG_I2("Initializing " << (is_system ? "system" : "user") << " database file " - << filename); - - if(filename.empty()) - { - dbInvalid = true; - return; - } - - if(!is_system && !filename.empty()) - { - auto file = boost::filesystem::path(filename_); - const auto directory = file.remove_filename(); - if(directory.string().empty()) - { - dbInvalid = true; - return; - } - - if(!(boost::filesystem::exists(directory))) - { - if(!boost::filesystem::create_directories(directory)) - MIOPEN_LOG_W("Unable to create a directory: " << directory); - else - boost::filesystem::permissions(directory, boost::filesystem::all_all); - } - } - sql = std::move(SQLite{filename_, is_system}); - if(!sql.Valid()) - { - dbInvalid = true; - if(!is_system) - MIOPEN_THROW(miopenStatusInternalError, "Cannot open database file:" + filename_); - else - MIOPEN_LOG_W("Unable to read system database file:" + filename_ + - " Performance may degrade"); - } - else - dbInvalid = false; - } - - static Derived& - GetCached(const std::string& path, bool is_system, const std::string& arch, std::size_t num_cu); - // TODO: Fix this for the overhead of having fields per record - - inline auto CheckTableColumns(const std::string& tableName, - const std::vector& goldenList) const - { - const auto sql_cfg_fds = "PRAGMA table_info(" + tableName + ");"; - SQLite::result_type cfg_res; - { - const auto lock = shared_lock(lock_file, GetLockTimeout()); - MIOPEN_VALIDATE_LOCK(lock); - cfg_res = sql.Exec(sql_cfg_fds); - } - std::vector cfg_fds(cfg_res.size()); - std::transform( - cfg_res.begin(), cfg_res.end(), cfg_fds.begin(), [](auto row) { return row["name"]; }); - // search in the golden vector - bool AllFound = true; - for(auto& goldenName : goldenList) - { - if(std::find(cfg_fds.begin(), cfg_fds.end(), goldenName) == cfg_fds.end()) - { - AllFound = false; - std::ostringstream ss; - ss << "Field " << goldenName << " not found in table: " << tableName; - MIOPEN_LOG_I2(ss.str()); - // break; Not breaking to enable logging of all missing fields. - } - } - return AllFound; - } - - inline auto SQLExec(const std::string& query) - { - MIOPEN_LOG_T(std::this_thread::get_id() << ":" << query); - { - auto rc = SQLRety([&]() { - return sqlite3_exec(ptrDb.get(), query.c_str(), find_callback, nullptr, nullptr); - }); - if(rc != SQLITE_OK) - { - MIOPEN_LOG_I2(query); - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); - sqlite3_close(ptrDb.get()); - return false; - } - } - return true; - } - inline auto SQLExec(const std::string& query, SQLRes_t& res) const - { - res.clear(); - MIOPEN_LOG_T(std::this_thread::get_id() << ":" << query); - { - auto rc = SQLRety([&]() { - return sqlite3_exec( - ptrDb.get(), query.c_str(), find_callback, static_cast(&res), nullptr); - }); - if(rc != SQLITE_OK) - { - MIOPEN_LOG_I2(query); - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); - sqlite3_close(ptrDb.get()); - return false; - } - } - return true; - } - - template - inline int SQLRety(F f) const - { - return SQLiteBase::SQLRety(f, filename); - } - - template - static inline int SQLRety(F f, std::string filename) - { - auto timeout_end = std::chrono::high_resolution_clock::now() + - std::chrono::seconds(30); // TODO: make configurable - auto tries = 0; - while(true) - { - int rc = f(); - if(rc == SQLITE_BUSY) - { - MIOPEN_LOG_I2("Database" + filename + " busy, retrying ..."); - ++tries; - if(tries > 50) - std::this_thread::sleep_for(std::chrono::microseconds(100)); - else - std::this_thread::yield(); - } - else - return rc; - if(std::chrono::high_resolution_clock::now() > timeout_end) - MIOPEN_THROW("Timeout while waiting for Database: " + filename); - } - } - - inline std::string SQLErrorMessage() const - { - std::ostringstream ss; - ss << "Internal error while accessing SQLite database: "; - ss << sqlite3_errstr(sqlite3_errcode(ptrDb.get())) << ":"; - ss << sqlite3_errmsg(ptrDb.get()); - return ss.str(); - } - - auto Prepare(const std::string& query) const - { - sqlite3_stmt* ptr = nullptr; - MIOPEN_LOG_I2(query); - auto rc = sqlite3_prepare_v2(ptrDb.get(), query.c_str(), query.size(), &ptr, nullptr); - if(rc != SQLITE_OK) - { - std::string err_msg = "SQLite prepare error: "; - MIOPEN_THROW(miopenStatusInternalError, err_msg + sqlite3_errmsg(ptrDb.get())); - } - return sqlite3_stmt_ptr{ptr}; - } - auto PrepareAndBind(const std::string& query, std::vector& values) const - { - auto stmt = Prepare(query); - int cnt = 1; - for(auto& kinder : values) - { - auto rc = sqlite3_bind_text( - stmt.get(), cnt++, kinder.data(), kinder.size(), SQLITE_TRANSIENT); // NOLINT - if(rc != SQLITE_OK) - MIOPEN_THROW(miopenStatusInternalError, SQLErrorMessage()); - } - MIOPEN_LOG_I2("[" << JoinStrings(values, ",") << "]"); - return stmt; - } - - template - inline auto FindRecord(U&... args) - { - const auto lock = shared_lock(lock_file, GetLockTimeout()); - MIOPEN_VALIDATE_LOCK(lock); - return reinterpret_cast(this)->FindRecordUnsafe(args...); - } - - template - inline auto RemoveRecord(U&... args) - { - const auto lock = exclusive_lock(lock_file, GetLockTimeout()); - MIOPEN_VALIDATE_LOCK(lock); - return reinterpret_cast(this)->RemoveRecordUnsafe(args...); - } - - template - inline auto StoreRecord(U&... args) - { - const auto lock = exclusive_lock(lock_file, GetLockTimeout()); - MIOPEN_VALIDATE_LOCK(lock); - return reinterpret_cast(this)->StoreRecordUnsafe(args...); - } - - template - inline auto Remove(const U&... args) - { - const auto lock = exclusive_lock(lock_file, GetLockTimeout()); - MIOPEN_VALIDATE_LOCK(lock); - return reinterpret_cast(this)->RemoveUnsafe(args...); - } - - template - inline auto Update(const U&... args) - { - const auto lock = exclusive_lock(lock_file, GetLockTimeout()); - MIOPEN_VALIDATE_LOCK(lock); - return reinterpret_cast(this)->UpdateUnsafe(args...); - } - - template - inline auto Load(U&&... args) - { - const auto lock = shared_lock(lock_file, GetLockTimeout()); - MIOPEN_VALIDATE_LOCK(lock); - return reinterpret_cast(this)->LoadUnsafe(args...); - } - - std::string filename; - std::string arch; - size_t num_cu; - LockFile& lock_file; - bool dbInvalid; - SQLite sql; -}; - -template -Derived& SQLiteBase::GetCached(const std::string& path, - bool is_system, - const std::string& arch, - const size_t num_cu) -{ - static std::mutex mutex; - static const std::lock_guard lock{mutex}; - - static auto instances = std::map{}; - const auto it = instances.find(path); - - if(it != instances.end()) - return *(it->second); - - instances.emplace(path, new Derived{path, is_system, arch, num_cu}); // NOLINT - return *(instances.at(path)); -} - -class SQLitePerfDb : public SQLiteBase -{ - public: - static constexpr char const* MIOPEN_PERFDB_SCHEMA_VER = "1.0.0"; - SQLitePerfDb(const std::string& filename_, - bool is_system, - const std::string& arch_, - std::size_t num_cu_); - - template - inline void InsertConfig(const T& prob_desc) - { - std::string clause; - std::vector vals; - std::tie(clause, vals) = prob_desc.InsertQuery(); - auto stmt = SQLite::Statement{sql, clause, vals}; - auto rc = stmt.Step(sql); - if(rc != SQLITE_DONE) - MIOPEN_THROW(miopenStatusInternalError, - "Failed to insert config: " + sql.ErrorMessage()); - auto cnt = sql.Changes(); - MIOPEN_LOG_I2(cnt << " rows updated"); - } - template - inline std::string GetConfigIDs(const T& prob_desc) - { - std::string clause; - std::vector vals; - std::tie(clause, vals) = prob_desc.WhereClause(); - auto query = "SELECT id FROM " + prob_desc.table_name() + " WHERE ( " + clause + " );"; - auto stmt = SQLite::Statement{sql, query, vals}; - while(true) - { - auto rc = stmt.Step(sql); - if(rc == SQLITE_ROW) - return stmt.ColumnText(0); - else if(rc == SQLITE_DONE) - return ""; - else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE) - MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage()); - } - } - template - inline boost::optional FindRecordUnsafe(const T& problem_config) - { - if(dbInvalid) - return boost::none; - std::string clause; - std::vector values; - std::tie(clause, values) = problem_config.WhereClause(); - // clang-format off - auto select_query = - "SELECT solver, params " - "FROM perf_db " - "INNER JOIN " + problem_config.table_name() + " " - "ON perf_db.config = " + problem_config.table_name() +".id " - "WHERE " - "( " + clause + " )" - "AND (arch = '" + arch + "' ) " - "AND (num_cu = '" + std::to_string(num_cu) + "');"; - // clang-format on - auto stmt = SQLite::Statement{sql, select_query, values}; - DbRecord rec; - while(true) - { - auto rc = stmt.Step(sql); - if(rc == SQLITE_ROW) - rec.SetValues(stmt.ColumnText(0), stmt.ColumnText(1)); - else if(rc == SQLITE_DONE) - break; - else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE) - MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage()); - } - if(rec.GetSize() == 0) - return boost::none; - else - return boost::optional(rec); - } - - /// Removes ID with associated VALUES from record with key PROBLEM_CONFIG from db. - /// - /// Returns true if remove was successful. Returns false if this PROBLEM_CONFIG or ID was not - /// found. - template - inline bool RemoveUnsafe(const T& problem_config, const std::string& id) - { - if(dbInvalid) - return false; - std::string clause; - std::vector values; - std::tie(clause, values) = problem_config.WhereClause(); - // clang-format off - auto query = - "DELETE FROM perf_db " - "WHERE config IN (" - "SELECT id FROM config WHERE ( " - + clause + " ) )" - "AND solver == '" + id + "' ;"; - // clang-format on - auto stmt = SQLite::Statement{sql, query, values}; - auto rc = stmt.Step(sql); - if(rc == SQLITE_DONE) - return true; - else - { - std::string msg = "Unable to remove database entry: "; - MIOPEN_LOG_E(msg + sql.ErrorMessage()); - return false; - } - } - - /// Updates record under key PROBLEM_CONFIG with data ID:VALUES in database. - /// Returns updated record or boost::none if insertion failed - template - inline boost::optional - UpdateUnsafe(const T& problem_config, const std::string& id, const V& values) - { - if(dbInvalid) - return boost::none; - // UPSERT the value - { - std::string clause; - std::vector vals; - std::tie(clause, vals) = problem_config.InsertQuery(); - auto stmt = SQLite::Statement{sql, clause, vals}; - auto rc = stmt.Step(sql); - if(rc != SQLITE_DONE) - MIOPEN_THROW(miopenStatusInternalError, - "Failed to insert config: " + sql.ErrorMessage()); - auto cnt = sql.Changes(); - MIOPEN_LOG_I2(cnt << " rows updated"); - } - - // UPSERT perf values - { - std::ostringstream params; - values.Serialize(params); - std::string clause; - std::vector vals; - std::tie(clause, vals) = problem_config.WhereClause(); - - // clang-format off - std::string query = - "INSERT OR REPLACE INTO " - "perf_db(config, solver, params, arch, num_cu) " - "VALUES(" - "(SELECT id FROM " + problem_config.table_name() + " " - "WHERE ( " + clause + " ) ) , ? , ? , ? , ?);"; - // clang-format on - vals.push_back(id); - vals.push_back(params.str()); - vals.push_back(arch); - vals.push_back(std::to_string(num_cu)); - auto stmt = SQLite::Statement{sql, query, vals}; - auto rc = stmt.Step(sql); - if(rc != SQLITE_DONE) - { - MIOPEN_LOG_E("Failed to insert performance record in the database: " + - sql.ErrorMessage()); - return boost::none; - } - } - DbRecord record; - record.SetValues(id, values); - return record; - } - - template - inline bool StoreRecordUnsafe(const T& problem_config, const std::string& id, const V& values) - { - if(dbInvalid) - return false; - return bool(UpdateUnsafe(problem_config, id, values)); - } - - /** - * clears both the config and the associated solver values from the database - */ - template - inline bool ClearRecordUnsafe(const T& problem_config) - { - if(dbInvalid) - return true; - std::string clause; - std::vector values; - std::tie(clause, values) = problem_config.WhereClause(); - // clang-format off - auto query = - "DELETE FROM perf_db " - "WHERE config IN (" - "SELECT id FROM config WHERE ( " - + clause + " ))"; - // clang-format on - auto stmt = SQLite::Statement{sql, query, values}; - auto rc = stmt.Step(sql); - if(rc != SQLITE_DONE) - { - MIOPEN_LOG_E("Unable to Clear databaes entry: " + sql.ErrorMessage()); - return false; - } - else - return true; - } - - /// Searches for record with key PROBLEM_CONFIG and gets VALUES under the ID from it. - /// Class T should have "void Serialize(PDAttr_t&) const" member function available. - /// Class V shall have "bool Deserialize(const std::string& str)" member function available. - /// - /// Returns false if the problem config is not found in the config table or if there are perf - /// parameters in the perf_db table - template - inline bool LoadUnsafe(const T& problem_config, const std::string& id, V& values) - { - if(dbInvalid) - return false; - const auto record = FindRecordUnsafe(problem_config); - - if(!record) - return false; - return record->GetValues(id, values); - } -}; -} // namespace miopen From 4c997ea81ec26120e71fcfca963d86168c916872 Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 6 May 2020 10:28:34 +0800 Subject: [PATCH 08/18] merge sqlite hpp 2 --- src/include/miopen/sqlite_db.hpp | 596 +++++++++++++++++++++++++++++++ 1 file changed, 596 insertions(+) diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp index e69de29bb2..76bc3597a1 100755 --- a/src/include/miopen/sqlite_db.hpp +++ b/src/include/miopen/sqlite_db.hpp @@ -0,0 +1,596 @@ +/******************************************************************************* +* +* MIT License +* +* Copyright (c) 2019 Advanced Micro Devices, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in all +* copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "sqlite3.h" +#include +#include + +#include +#include +#include + +namespace boost { +namespace filesystem { +class path; +} // namespace filesystem +} // namespace boost + +namespace miopen { + +#define MIOPEN_VALIDATE_LOCK(lock) \ + do \ + { \ + if(!(lock)) \ + MIOPEN_THROW("Db lock has failed to lock."); \ + } while(false) + +template +struct SQLiteSerializable +{ + std::vector FieldNames() const + { + std::vector names; + Derived::Visit(static_cast(*this), + [&](const std::string& value, const std::string& name) { + std::ignore = value; + names.push_back(name); + }); + Derived::Visit(static_cast(*this), + [&](const int value, const std::string name) { + std::ignore = value; + names.push_back(name); + }); + + return names; + } + std::tuple> WhereClause() const + { + std::vector values; + std::vector clauses; + Derived::Visit(static_cast(*this), + [&](const std::string& value, const std::string& name) { + clauses.push_back("(" + name + " = ? )"); + values.push_back(value); + }); + Derived::Visit(static_cast(*this), + [&](const int value, const std::string name) { + clauses.push_back("(" + name + " = ? )"); + values.push_back(std::to_string(value)); + }); + std::string clause = JoinStrings(clauses, " AND "); + return std::make_tuple(clause, values); + } + std::tuple> InsertQuery() const + { + std::vector int_names, str_names, values; + Derived::Visit(static_cast(*this), + [&](const std::string& value, const std::string& name) { + str_names.push_back(name); + values.push_back(value); + }); + Derived::Visit(static_cast(*this), + [&](const int value, const std::string name) { + int_names.push_back(name); + values.push_back(std::to_string(value)); + }); + std::vector tokens((values.size()), "?"); + ; + + std::string q = "INSERT OR IGNORE INTO " + Derived::table_name() + "( " + + JoinStrings(str_names, ",") + "," + JoinStrings(int_names, ",") + + " ) VALUES( " + JoinStrings(tokens, ",") + ");"; + return std::make_tuple(q, values); + } + std::tuple> SelectQuery() const + { + std::string clauses; + std::vector values; + std::tie(clauses, values) = WhereClause(); + std::string query = "SELECT id FROM " + Derived::table_name() + " WHERE " + clauses + ";"; + return std::make_tuple(query, values); + } + + std::string CreateQuery() const + { + std::vector str_fields; + Derived::Visit(static_cast(*this), + [&](const std::string value, const std::string name) { + std::ignore = value; + str_fields.push_back(name); + }); + std::vector int_fields; + Derived::Visit(static_cast(*this), + [&](const int value, const std::string name) { + std::ignore = value; + int_fields.push_back(name); + }); + std::ostringstream ss; + ss << "CREATE TABLE IF NOT EXISTS `" << Derived::table_name() << "` (" + << "`id` INTEGER PRIMARY KEY ASC"; + for(auto& el : str_fields) + ss << ",`" << el << "` TEXT NOT NULL"; + for(auto& el : int_fields) + ss << ",`" << el << "` INT NOT NULL"; + ss << ");"; + ss << "CREATE UNIQUE INDEX IF NOT EXISTS " + << "`idx_" << Derived::table_name() << "` " + << "ON " << Derived::table_name() << "( " << miopen::JoinStrings(str_fields, ",") << ", " + << miopen::JoinStrings(int_fields, ",") << " );"; + return ss.str(); + } +}; + +class SQLite +{ + class impl; + // do we need propagate const + std::unique_ptr pImpl; + + public: + class Statement + { + class impl; + std::unique_ptr pImpl; + + public: + Statement(const SQLite& sql, const std::string& query); + Statement(const SQLite& sql, + const std::string& query, + const std::vector& vals); + Statement(); + ~Statement(); + Statement(Statement&&) noexcept; + Statement& operator=(Statement&&) noexcept; + Statement& operator=(const Statement&) = delete; + int Step(const SQLite& sql); + std::string ColumnText(int idx); + std::string ColumnBlob(int idx); + int64_t ColumnInt64(int idx); + int BindText(int idx, const std::string& txt); + int BindBlob(int idx, const std::string& blob); + int BindInt64(int idx, int64_t); + }; + + using result_type = std::vector>; + SQLite(); + SQLite(const std::string& filename_, bool is_system); + ~SQLite(); + SQLite(SQLite&&) noexcept; + SQLite& operator=(SQLite&&) noexcept; + SQLite& operator=(const SQLite&) = delete; + bool Valid() const; + result_type Exec(const std::string& query) const; + int Changes() const; + int Retry(std::function) const; + static int Retry(std::function f, std::string filename); + std::string ErrorMessage() const; +}; + +template +class SQLiteBase +{ + protected: + using exclusive_lock = boost::unique_lock; + using shared_lock = boost::shared_lock; + static boost::system_time GetLockTimeout() + { + return boost::get_system_time() + boost::posix_time::milliseconds(60000); + } + + public: + SQLiteBase(const std::string& filename_, + bool is_system, + const std::string& arch_, + std::size_t num_cu_) + : filename(filename_), + arch(arch_), + num_cu(num_cu_), + lock_file(LockFile::Get(LockFilePath(filename_).c_str())) + { + MIOPEN_LOG_I2("Initializing " << (is_system ? "system" : "user") << " database file " + << filename); + + if(filename.empty()) + { + dbInvalid = true; + return; + } + + if(!is_system && !filename.empty()) + { + auto file = boost::filesystem::path(filename_); + const auto directory = file.remove_filename(); + if(directory.string().empty()) + { + dbInvalid = true; + return; + } + + if(!(boost::filesystem::exists(directory))) + { + if(!boost::filesystem::create_directories(directory)) + MIOPEN_LOG_W("Unable to create a directory: " << directory); + else + boost::filesystem::permissions(directory, boost::filesystem::all_all); + } + } + sql = std::move(SQLite{filename_, is_system}); + if(!sql.Valid()) + { + dbInvalid = true; + if(!is_system) + MIOPEN_THROW(miopenStatusInternalError, "Cannot open database file:" + filename_); + else + MIOPEN_LOG_W("Unable to read system database file:" + filename_ + + " Performance may degrade"); + } + else + dbInvalid = false; + } + + static Derived& + GetCached(const std::string& path, bool is_system, const std::string& arch, std::size_t num_cu); + // TODO: Fix this for the overhead of having fields per record + + inline auto CheckTableColumns(const std::string& tableName, + const std::vector& goldenList) const + { + const auto sql_cfg_fds = "PRAGMA table_info(" + tableName + ");"; + SQLite::result_type cfg_res; + { + const auto lock = shared_lock(lock_file, GetLockTimeout()); + MIOPEN_VALIDATE_LOCK(lock); + cfg_res = sql.Exec(sql_cfg_fds); + } + std::vector cfg_fds(cfg_res.size()); + std::transform( + cfg_res.begin(), cfg_res.end(), cfg_fds.begin(), [](auto row) { return row["name"]; }); + // search in the golden vector + bool AllFound = true; + for(auto& goldenName : goldenList) + { + if(std::find(cfg_fds.begin(), cfg_fds.end(), goldenName) == cfg_fds.end()) + { + AllFound = false; + std::ostringstream ss; + ss << "Field " << goldenName << " not found in table: " << tableName; + MIOPEN_LOG_I2(ss.str()); + // break; Not breaking to enable logging of all missing fields. + } + } + return AllFound; + } + + template + inline auto FindRecord(U&... args) + { + const auto lock = shared_lock(lock_file, GetLockTimeout()); + MIOPEN_VALIDATE_LOCK(lock); + return reinterpret_cast(this)->FindRecordUnsafe(args...); + } + + template + inline auto RemoveRecord(U&... args) + { + const auto lock = exclusive_lock(lock_file, GetLockTimeout()); + MIOPEN_VALIDATE_LOCK(lock); + return reinterpret_cast(this)->RemoveRecordUnsafe(args...); + } + + template + inline auto StoreRecord(U&... args) + { + const auto lock = exclusive_lock(lock_file, GetLockTimeout()); + MIOPEN_VALIDATE_LOCK(lock); + return reinterpret_cast(this)->StoreRecordUnsafe(args...); + } + + template + inline auto Remove(const U&... args) + { + const auto lock = exclusive_lock(lock_file, GetLockTimeout()); + MIOPEN_VALIDATE_LOCK(lock); + return reinterpret_cast(this)->RemoveUnsafe(args...); + } + + template + inline auto Update(const U&... args) + { + const auto lock = exclusive_lock(lock_file, GetLockTimeout()); + MIOPEN_VALIDATE_LOCK(lock); + return reinterpret_cast(this)->UpdateUnsafe(args...); + } + + template + inline auto Load(U&&... args) + { + const auto lock = shared_lock(lock_file, GetLockTimeout()); + MIOPEN_VALIDATE_LOCK(lock); + return reinterpret_cast(this)->LoadUnsafe(args...); + } + + std::string filename; + std::string arch; + size_t num_cu; + LockFile& lock_file; + bool dbInvalid; + SQLite sql; +}; + +template +Derived& SQLiteBase::GetCached(const std::string& path, + bool is_system, + const std::string& arch, + const size_t num_cu) +{ + static std::mutex mutex; + static const std::lock_guard lock{mutex}; + + static auto instances = std::map{}; + const auto it = instances.find(path); + + if(it != instances.end()) + return *(it->second); + + instances.emplace(path, new Derived{path, is_system, arch, num_cu}); // NOLINT + return *(instances.at(path)); +} + +class SQLitePerfDb : public SQLiteBase +{ + public: + static constexpr char const* MIOPEN_PERFDB_SCHEMA_VER = "1.0.0"; + SQLitePerfDb(const std::string& filename_, + bool is_system, + const std::string& arch_, + std::size_t num_cu_); + + template + inline void InsertConfig(const T& prob_desc) + { + std::string clause; + std::vector vals; + std::tie(clause, vals) = prob_desc.InsertQuery(); + auto stmt = SQLite::Statement{sql, clause, vals}; + auto rc = stmt.Step(sql); + if(rc != SQLITE_DONE) + MIOPEN_THROW(miopenStatusInternalError, + "Failed to insert config: " + sql.ErrorMessage()); + auto cnt = sql.Changes(); + MIOPEN_LOG_I2(cnt << " rows updated"); + } + template + inline std::string GetConfigIDs(const T& prob_desc) + { + std::string clause; + std::vector vals; + std::tie(clause, vals) = prob_desc.WhereClause(); + auto query = "SELECT id FROM " + prob_desc.table_name() + " WHERE ( " + clause + " );"; + auto stmt = SQLite::Statement{sql, query, vals}; + while(true) + { + auto rc = stmt.Step(sql); + if(rc == SQLITE_ROW) + return stmt.ColumnText(0); + else if(rc == SQLITE_DONE) + return ""; + else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE) + MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage()); + } + } + template + inline boost::optional FindRecordUnsafe(const T& problem_config) + { + if(dbInvalid) + return boost::none; + std::string clause; + std::vector values; + std::tie(clause, values) = problem_config.WhereClause(); + // clang-format off + auto select_query = + "SELECT solver, params " + "FROM perf_db " + "INNER JOIN " + problem_config.table_name() + " " + "ON perf_db.config = " + problem_config.table_name() +".id " + "WHERE " + "( " + clause + " )" + "AND (arch = '" + arch + "' ) " + "AND (num_cu = '" + std::to_string(num_cu) + "');"; + // clang-format on + auto stmt = SQLite::Statement{sql, select_query, values}; + DbRecord rec; + while(true) + { + auto rc = stmt.Step(sql); + if(rc == SQLITE_ROW) + rec.SetValues(stmt.ColumnText(0), stmt.ColumnText(1)); + else if(rc == SQLITE_DONE) + break; + else if(rc == SQLITE_ERROR || rc == SQLITE_MISUSE) + MIOPEN_THROW(miopenStatusInternalError, sql.ErrorMessage()); + } + if(rec.GetSize() == 0) + return boost::none; + else + return boost::optional(rec); + } + + /// Removes ID with associated VALUES from record with key PROBLEM_CONFIG from db. + /// + /// Returns true if remove was successful. Returns false if this PROBLEM_CONFIG or ID was not + /// found. + template + inline bool RemoveUnsafe(const T& problem_config, const std::string& id) + { + if(dbInvalid) + return false; + std::string clause; + std::vector values; + std::tie(clause, values) = problem_config.WhereClause(); + // clang-format off + auto query = + "DELETE FROM perf_db " + "WHERE config IN (" + "SELECT id FROM config WHERE ( " + + clause + " ) )" + "AND solver == '" + id + "' ;"; + // clang-format on + auto stmt = SQLite::Statement{sql, query, values}; + auto rc = stmt.Step(sql); + if(rc == SQLITE_DONE) + return true; + else + { + std::string msg = "Unable to remove database entry: "; + MIOPEN_LOG_E(msg + sql.ErrorMessage()); + return false; + } + } + + /// Updates record under key PROBLEM_CONFIG with data ID:VALUES in database. + /// Returns updated record or boost::none if insertion failed + template + inline boost::optional + UpdateUnsafe(const T& problem_config, const std::string& id, const V& values) + { + if(dbInvalid) + return boost::none; + // UPSERT the value + { + std::string clause; + std::vector vals; + std::tie(clause, vals) = problem_config.InsertQuery(); + auto stmt = SQLite::Statement{sql, clause, vals}; + auto rc = stmt.Step(sql); + if(rc != SQLITE_DONE) + MIOPEN_THROW(miopenStatusInternalError, + "Failed to insert config: " + sql.ErrorMessage()); + auto cnt = sql.Changes(); + MIOPEN_LOG_I2(cnt << " rows updated"); + } + + // UPSERT perf values + { + std::ostringstream params; + values.Serialize(params); + std::string clause; + std::vector vals; + std::tie(clause, vals) = problem_config.WhereClause(); + + // clang-format off + std::string query = + "INSERT OR REPLACE INTO " + "perf_db(config, solver, params, arch, num_cu) " + "VALUES(" + "(SELECT id FROM " + problem_config.table_name() + " " + "WHERE ( " + clause + " ) ) , ? , ? , ? , ?);"; + // clang-format on + vals.push_back(id); + vals.push_back(params.str()); + vals.push_back(arch); + vals.push_back(std::to_string(num_cu)); + auto stmt = SQLite::Statement{sql, query, vals}; + auto rc = stmt.Step(sql); + if(rc != SQLITE_DONE) + { + MIOPEN_LOG_E("Failed to insert performance record in the database: " + + sql.ErrorMessage()); + return boost::none; + } + } + DbRecord record; + record.SetValues(id, values); + return record; + } + + template + inline bool StoreRecordUnsafe(const T& problem_config, const std::string& id, const V& values) + { + if(dbInvalid) + return false; + return bool(UpdateUnsafe(problem_config, id, values)); + } + + /** + * clears both the config and the associated solver values from the database + */ + template + inline bool ClearRecordUnsafe(const T& problem_config) + { + if(dbInvalid) + return true; + std::string clause; + std::vector values; + std::tie(clause, values) = problem_config.WhereClause(); + // clang-format off + auto query = + "DELETE FROM perf_db " + "WHERE config IN (" + "SELECT id FROM config WHERE ( " + + clause + " ))"; + // clang-format on + auto stmt = SQLite::Statement{sql, query, values}; + auto rc = stmt.Step(sql); + if(rc != SQLITE_DONE) + { + MIOPEN_LOG_E("Unable to Clear databaes entry: " + sql.ErrorMessage()); + return false; + } + else + return true; + } + + /// Searches for record with key PROBLEM_CONFIG and gets VALUES under the ID from it. + /// Class T should have "void Serialize(PDAttr_t&) const" member function available. + /// Class V shall have "bool Deserialize(const std::string& str)" member function available. + /// + /// Returns false if the problem config is not found in the config table or if there are perf + /// parameters in the perf_db table + template + inline bool LoadUnsafe(const T& problem_config, const std::string& id, V& values) + { + if(dbInvalid) + return false; + const auto record = FindRecordUnsafe(problem_config); + + if(!record) + return false; + return record->GetValues(id, values); + } +}; +} // namespace miopen \ No newline at end of file From ab73c1caeb939c052983ff81f14757bb91e33f64 Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 6 May 2020 11:32:54 +0800 Subject: [PATCH 09/18] merge sqlitedb.hpp --- src/include/miopen/sqlite_db.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/miopen/sqlite_db.hpp b/src/include/miopen/sqlite_db.hpp index 76bc3597a1..3c80254087 100755 --- a/src/include/miopen/sqlite_db.hpp +++ b/src/include/miopen/sqlite_db.hpp @@ -593,4 +593,4 @@ class SQLitePerfDb : public SQLiteBase return record->GetValues(id, values); } }; -} // namespace miopen \ No newline at end of file +} // namespace miopen From fcf342c658a052cfe88574f31574a18aa1bd175d Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 6 May 2020 21:06:24 +0800 Subject: [PATCH 10/18] add v4r1 bwd xdlops kernel in invoker --- src/conv/invokers/impl_gemm.cpp | 1 + src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/conv/invokers/impl_gemm.cpp b/src/conv/invokers/impl_gemm.cpp index 4b2ddc8ddc..412dd342ef 100644 --- a/src/conv/invokers/impl_gemm.cpp +++ b/src/conv/invokers/impl_gemm.cpp @@ -143,6 +143,7 @@ InvokerFactory MakeImplGemmDataInvokerFactory(const ConvolutionContext& ctx) // clang-format off else if( kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw" || + kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw" || kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v4r1_ncdhw_kczyx_nkdhw") // clang-format on { diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp index ae79d911f5..4a4777e9d5 100755 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -216,7 +216,7 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext& 1, GemmBBlockCopyThreadSliceLengths_GemmN, GemmABlockCopyThreadSliceLengths_GemmM, - 1); + GetEPackLength(ctx, true)); return lds_size <= 64 * 1024; } From f148d26a321713d21c4992a3fd6fc966ff944e04 Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Thu, 7 May 2020 09:44:05 +0800 Subject: [PATCH 11/18] delete dead code in igemm bwd xdlops solver --- src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp index 4a4777e9d5..364ff9f04a 100755 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -510,15 +510,8 @@ bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsValidPerformanceConfig( PerformanceImplicitGemmBwdDataV4R1Xdlops ConvHipImplicitGemmBwdDataV4R1Xdlops::Search(const ConvolutionContext& ctx) const { - // \todo add fp16 and bfp16 kernels return GenericSearchBwd(*this, ctx); - - // fp16/bfp16 uses fp32 workspace to leverage fp32 atomic add - // if(ctx.IsFp16() || ctx.IsBfp16()) - // return GenericSearchBwd(*this, ctx, SearchTweak::WorkspaceInsteadOfXBuffer); - // else - // return GenericSearchBwd(*this, ctx); } int ConvHipImplicitGemmBwdDataV4R1Xdlops::RunAndMeasureSolution(miopen::Handle& profile_h, From 9edd884add74bf5f5805da9124ee2101c238a33c Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Thu, 7 May 2020 10:53:04 +0800 Subject: [PATCH 12/18] update license information for igemm bwd xdlops solver --- src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp index 364ff9f04a..1ee8e717cc 100755 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2019 Advanced Micro Devices, Inc. + * Copyright (c) 2020 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal From c7b28ba86559af82a1c38e24cac68be2a399ce3c Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Thu, 7 May 2020 11:13:37 +0800 Subject: [PATCH 13/18] rename vars: use GemmA/B instead of In/WeiBlock --- src/include/miopen/solver.hpp | 16 ++-- ...hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 77 +++++++++---------- 2 files changed, 44 insertions(+), 49 deletions(-) diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index a317638bf1..be62dffe12 100755 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -678,11 +678,11 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops int GemmMPerWave; int GemmNPerWave; - int InBlockCopyClusterLengths_GemmK; // 2^n[4..16] - int InBlockCopyClusterLengths_GemmN; // 2^n[8..64] + int GemmBBlockCopyClusterLengths_GemmK; // 2^n[4..16] + int GemmBBlockCopyClusterLengths_GemmN; // 2^n[8..64] - int WeiBlockCopyClusterLengths_GemmK; // 2^n[1..4] - int WeiBlockCopyClusterLengths_GemmM; // 2^n[16..128] + int GemmABlockCopyClusterLengths_GemmK; // 2^n[1..4] + int GemmABlockCopyClusterLengths_GemmM; // 2^n[16..128] bool use_spare_set; @@ -711,10 +711,10 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops f(self.GemmKPerBlock, "GemmKPerBlock"); f(self.GemmMPerWave, "GemmMPerWave"); f(self.GemmNPerWave, "GemmNPerWave"); - f(self.InBlockCopyClusterLengths_GemmK, "InBlockCopyClusterLengths_GemmK"); - f(self.InBlockCopyClusterLengths_GemmN, "InBlockCopyClusterLengths_GemmN"); - f(self.WeiBlockCopyClusterLengths_GemmK, "WeiBlockCopyClusterLengths_GemmK"); - f(self.WeiBlockCopyClusterLengths_GemmM, "WeiBlockCopyClusterLengths_GemmM"); + f(self.GemmBBlockCopyClusterLengths_GemmK, "GemmBBlockCopyClusterLengths_GemmK"); + f(self.GemmBBlockCopyClusterLengths_GemmN, "GemmBBlockCopyClusterLengths_GemmN"); + f(self.GemmABlockCopyClusterLengths_GemmK, "GemmABlockCopyClusterLengths_GemmK"); + f(self.GemmABlockCopyClusterLengths_GemmM, "GemmABlockCopyClusterLengths_GemmM"); } std::tuple CalculateGridSize(const ConvolutionContext& ctx) const; diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp index 1ee8e717cc..03e0041a91 100755 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -79,7 +79,7 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmABlockCopyPerformancePara SrcDataPerRead_GemmM = 1; // calculate threadwise copy size - const auto a_data_per_thread_copy = GemmMPerBlock / WeiBlockCopyClusterLengths_GemmM; + const auto a_data_per_thread_copy = GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM; if(!(a_data_per_thread_copy > 0)) MIOPEN_THROW("invalid performance parameter"); @@ -127,7 +127,7 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformancePara } // calculate threadwise copy size - int b_data_per_thread_copy = GemmNPerBlock / InBlockCopyClusterLengths_GemmN; + int b_data_per_thread_copy = GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN; if(!(b_data_per_thread_copy > 0)) MIOPEN_THROW("invalid performance parameter"); @@ -166,11 +166,6 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext& return false; // wrong! cannot divice N evenly among thread } - const auto& GemmBBlockCopyClusterLengths_GemmK = InBlockCopyClusterLengths_GemmK; - const auto& GemmBBlockCopyClusterLengths_GemmN = InBlockCopyClusterLengths_GemmN; - const auto& GemmABlockCopyClusterLengths_GemmK = WeiBlockCopyClusterLengths_GemmK; - const auto& GemmABlockCopyClusterLengths_GemmM = WeiBlockCopyClusterLengths_GemmM; - if(!(GemmKPerBlock % GemmBBlockCopyClusterLengths_GemmK == 0 && GemmKPerBlock % GemmABlockCopyClusterLengths_GemmK == 0 && GemmNPerBlock % GemmBBlockCopyClusterLengths_GemmN == 0 && @@ -229,35 +224,35 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlo GemmMPerWave = spare ? 4 : 64; GemmNPerWave = spare ? 16 : 64; - InBlockCopyClusterLengths_GemmK = 4; - InBlockCopyClusterLengths_GemmN = 4; + GemmBBlockCopyClusterLengths_GemmK = 4; + GemmBBlockCopyClusterLengths_GemmN = 4; - WeiBlockCopyClusterLengths_GemmK = 2; - WeiBlockCopyClusterLengths_GemmM = 4; + GemmABlockCopyClusterLengths_GemmK = 2; + GemmABlockCopyClusterLengths_GemmM = 4; use_spare_set = spare; } PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlops( - int BPerBlock_, - int KPerBlock_, - int EPerBlock_, + int GemmNPerBlock_, + int GemmMPerBlock_, + int GemmKPerBlock_, int GemmMPerWave_, int GemmNPerWave_, - int InBlockCopyClusterLengths_E_, - int InBlockCopyClusterLengths_B_, - int WeiBlockCopyClusterLengths_E_, - int WeiBlockCopyClusterLengths_K_, + int GemmBBlockCopyClusterLengths_GemmK_, + int GemmBBlockCopyClusterLengths_GemmN_, + int GemmABlockCopyClusterLengths_GemmK_, + int GemmABlockCopyClusterLengths_GemmM_, bool use_spare_set_) - : GemmNPerBlock(BPerBlock_), - GemmMPerBlock(KPerBlock_), - GemmKPerBlock(EPerBlock_), + : GemmNPerBlock(GemmNPerBlock_), + GemmMPerBlock(GemmMPerBlock_), + GemmKPerBlock(GemmKPerBlock_), GemmMPerWave(GemmMPerWave_), GemmNPerWave(GemmNPerWave_), - InBlockCopyClusterLengths_GemmK(InBlockCopyClusterLengths_E_), - InBlockCopyClusterLengths_GemmN(InBlockCopyClusterLengths_B_), - WeiBlockCopyClusterLengths_GemmK(WeiBlockCopyClusterLengths_E_), - WeiBlockCopyClusterLengths_GemmM(WeiBlockCopyClusterLengths_K_), + GemmBBlockCopyClusterLengths_GemmK(GemmBBlockCopyClusterLengths_GemmK_), + GemmBBlockCopyClusterLengths_GemmN(GemmBBlockCopyClusterLengths_GemmN_), + GemmABlockCopyClusterLengths_GemmK(GemmABlockCopyClusterLengths_GemmK_), + GemmABlockCopyClusterLengths_GemmM(GemmABlockCopyClusterLengths_GemmM_), use_spare_set(use_spare_set_) { } @@ -271,10 +266,10 @@ operator==(const PerformanceImplicitGemmBwdDataV4R1Xdlops& other) const && GemmKPerBlock == other.GemmKPerBlock && GemmMPerWave == other.GemmMPerWave && GemmNPerWave == other.GemmNPerWave - && InBlockCopyClusterLengths_GemmK == other.InBlockCopyClusterLengths_GemmK - && InBlockCopyClusterLengths_GemmN == other.InBlockCopyClusterLengths_GemmN - && WeiBlockCopyClusterLengths_GemmK == other.WeiBlockCopyClusterLengths_GemmK - && WeiBlockCopyClusterLengths_GemmM == other.WeiBlockCopyClusterLengths_GemmM + && GemmBBlockCopyClusterLengths_GemmK == other.GemmBBlockCopyClusterLengths_GemmK + && GemmBBlockCopyClusterLengths_GemmN == other.GemmBBlockCopyClusterLengths_GemmN + && GemmABlockCopyClusterLengths_GemmK == other.GemmABlockCopyClusterLengths_GemmK + && GemmABlockCopyClusterLengths_GemmM == other.GemmABlockCopyClusterLengths_GemmM && use_spare_set == other.use_spare_set; // clang-format on } @@ -287,10 +282,10 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValidValue() const && IsTwoPower<4,32>(GemmKPerBlock) && IsTwoPower<4,64>(GemmMPerWave) && IsTwoPower<16,64>(GemmNPerWave) - && IsTwoPower<4,16>(InBlockCopyClusterLengths_GemmK) - && IsTwoPower<4,64>(InBlockCopyClusterLengths_GemmN) - && IsTwoPower<2,16>(WeiBlockCopyClusterLengths_GemmK) - && IsTwoPower<4,128>(WeiBlockCopyClusterLengths_GemmM); // clang-format on + && IsTwoPower<4,16>(GemmBBlockCopyClusterLengths_GemmK) + && IsTwoPower<4,64>(GemmBBlockCopyClusterLengths_GemmN) + && IsTwoPower<2,16>(GemmABlockCopyClusterLengths_GemmK) + && IsTwoPower<4,128>(GemmABlockCopyClusterLengths_GemmM); // clang-format on } bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue() @@ -319,13 +314,13 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue() if(!NextTwoPower<16, 64>(GemmNPerWave)) break; } - if(!NextTwoPower<4, 16>(InBlockCopyClusterLengths_GemmK)) + if(!NextTwoPower<4, 16>(GemmBBlockCopyClusterLengths_GemmK)) break; - if(!NextTwoPower<4, 64>(InBlockCopyClusterLengths_GemmN)) + if(!NextTwoPower<4, 64>(GemmBBlockCopyClusterLengths_GemmN)) break; - if(!NextTwoPower<2, 16>(WeiBlockCopyClusterLengths_GemmK)) + if(!NextTwoPower<2, 16>(GemmABlockCopyClusterLengths_GemmK)) break; - if(!NextTwoPower<4, 128>(WeiBlockCopyClusterLengths_GemmM)) + if(!NextTwoPower<4, 128>(GemmABlockCopyClusterLengths_GemmM)) break; return false; } while(false); @@ -653,12 +648,12 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution( std::string(" -DCK_PARAM_TUNABLE_GEMM_K_PER_BLOCK=") + std::to_string(GemmKPerBlock) + std::string(" -DCK_PARAM_GEMM_M_PER_WAVE=") + std::to_string(GemmMPerWave) + std::string(" -DCK_PARAM_GEMM_N_PER_WAVE=") + std::to_string(GemmNPerWave) + - std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.WeiBlockCopyClusterLengths_GemmK) + - std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M=") + std::to_string(config.WeiBlockCopyClusterLengths_GemmM) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.GemmABlockCopyClusterLengths_GemmK) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M=") + std::to_string(config.GemmABlockCopyClusterLengths_GemmM) + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_M=") + std::to_string(GemmABlockCopySrcDataPerRead_GemmM ) + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_M=") + std::to_string(GemmABlockCopyDstDataPerWrite_GemmM) + - std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.InBlockCopyClusterLengths_GemmK) + - std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N=") + std::to_string(config.InBlockCopyClusterLengths_GemmN) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.GemmBBlockCopyClusterLengths_GemmK) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N=") + std::to_string(config.GemmBBlockCopyClusterLengths_GemmN) + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N=") + std::to_string(GemmBBlockCopySrcDataPerRead_GemmN ) + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N=") + std::to_string(GemmBBlockCopyDstDataPerWrite_GemmN) + std::string(" -DCK_PARAM_DEPENDENT_GRID_SIZE=") + std::to_string(grid_size) + From 39f7cc01f2248c56fcb7faab8e6093fe73913d6c Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Thu, 7 May 2020 14:58:45 +0800 Subject: [PATCH 14/18] remove threadwise gemm's inliine macro --- src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp index 03e0041a91..114239fbdb 100755 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -657,7 +657,6 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution( std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N=") + std::to_string(GemmBBlockCopySrcDataPerRead_GemmN ) + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N=") + std::to_string(GemmBBlockCopyDstDataPerWrite_GemmN) + std::string(" -DCK_PARAM_DEPENDENT_GRID_SIZE=") + std::to_string(grid_size) + - std::string(" -DCK_THREADWISE_GEMM_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') + std::string(" -DCK_USE_AMD_BUFFER_ATOMIC_ADD=") + (support_amd_buffer_atomic_add(ctx) ? '1' : '0') + std::string(" -DCK_USE_AMD_XDLOPS=") + std::to_string(IsXdlopsSupport(ctx) ? 1 : 0) + std::string(" -DCK_USE_AMD_XDLOPS_INLINE_ASM=") + std::to_string(miopen::IsEnabled(MIOPEN_DEBUG_IMPLICIT_GEMM_XDLOPS_INLINE_ASM{}) ? 1 : 0) + From 4a1e51149be5e9cfc90baa732a2c5179f58f4cab Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Fri, 8 May 2020 16:05:18 +0800 Subject: [PATCH 15/18] remove cluster lengths from tunable params --- src/include/miopen/solver.hpp | 23 +- ...hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 263 +++++++++++------- 2 files changed, 179 insertions(+), 107 deletions(-) mode change 100755 => 100644 src/include/miopen/solver.hpp mode change 100755 => 100644 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp old mode 100755 new mode 100644 index be62dffe12..650768f0b7 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -677,25 +677,24 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops int GemmMPerWave; int GemmNPerWave; - +#if 0 int GemmBBlockCopyClusterLengths_GemmK; // 2^n[4..16] int GemmBBlockCopyClusterLengths_GemmN; // 2^n[8..64] int GemmABlockCopyClusterLengths_GemmK; // 2^n[1..4] int GemmABlockCopyClusterLengths_GemmM; // 2^n[16..128] - +#endif bool use_spare_set; - PerformanceImplicitGemmBwdDataV4R1Xdlops(int, int, int, int, int, int, int, int, int, bool); + PerformanceImplicitGemmBwdDataV4R1Xdlops(int, int, int, int, int, bool); PerformanceImplicitGemmBwdDataV4R1Xdlops() - : PerformanceImplicitGemmBwdDataV4R1Xdlops(-1, -1, -1, -1, -1, -1, -1, -1, -1, false) + : PerformanceImplicitGemmBwdDataV4R1Xdlops(-1, -1, -1, -1, -1, false) { } - PerformanceImplicitGemmBwdDataV4R1Xdlops( - int a, int b, int c, int d, int e, int f, int g, int h, int i) - : PerformanceImplicitGemmBwdDataV4R1Xdlops(a, b, c, d, e, f, g, h, i, false) + PerformanceImplicitGemmBwdDataV4R1Xdlops(int a, int b, int c, int d, int e) + : PerformanceImplicitGemmBwdDataV4R1Xdlops(a, b, c, d, e, false) { } @@ -711,20 +710,20 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops f(self.GemmKPerBlock, "GemmKPerBlock"); f(self.GemmMPerWave, "GemmMPerWave"); f(self.GemmNPerWave, "GemmNPerWave"); +#if 0 f(self.GemmBBlockCopyClusterLengths_GemmK, "GemmBBlockCopyClusterLengths_GemmK"); f(self.GemmBBlockCopyClusterLengths_GemmN, "GemmBBlockCopyClusterLengths_GemmN"); f(self.GemmABlockCopyClusterLengths_GemmK, "GemmABlockCopyClusterLengths_GemmK"); f(self.GemmABlockCopyClusterLengths_GemmM, "GemmABlockCopyClusterLengths_GemmM"); +#endif } std::tuple CalculateGridSize(const ConvolutionContext& ctx) const; - std::tuple - CalculateBlockGemmPerformanceParameters(const ConvolutionContext& ctx) const; - std::tuple + std::tuple CalculateLdsNumberOfByte(const ConvolutionContext& ctx) const; + std::tuple CalculateGemmABlockCopyPerformanceParameters(const ConvolutionContext& ctx) const; - std::tuple + std::tuple CalculateGemmBBlockCopyPerformanceParameters(const ConvolutionContext& ctx) const; - // std::tuple CalculateLdsNumberOfByte(const ConvolutionContext& ctx) const; bool IsValidValue() const; bool IsValid(const ConvolutionContext& ctx) const; void EuristicInit(const ConvolutionContext& ctx); diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp old mode 100755 new mode 100644 index 114239fbdb..0273e418b5 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -59,15 +59,21 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGridSize(const ConvolutionCon return std::make_tuple(GridSize, true); } -std::tuple +std::tuple PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmABlockCopyPerformanceParameters( const ConvolutionContext& ctx) const { + int ClusterLengths_GemmK = 0; + int ClusterLengths_GemmM = 0; int SrcDataPerRead_GemmM = amd_buffer_load_max_length(); int DstDataPerWrite_GemmM = amd_lds_write_max_length(); try { + const auto WaveSize = 64; + const auto BlockSize = + GemmNPerBlock * GemmMPerBlock / (GemmMPerWave * GemmNPerWave) * WaveSize; + // calculate vector length on gemmk dimension SrcDataPerRead_GemmM = gcd(SrcDataPerRead_GemmM, GemmMPerBlock); @@ -79,7 +85,7 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmABlockCopyPerformancePara SrcDataPerRead_GemmM = 1; // calculate threadwise copy size - const auto a_data_per_thread_copy = GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM; + const auto a_data_per_thread_copy = (GemmKPerBlock * GemmMPerBlock) / BlockSize; if(!(a_data_per_thread_copy > 0)) MIOPEN_THROW("invalid performance parameter"); @@ -87,26 +93,48 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmABlockCopyPerformancePara // GemmABlockCopySrcDataPerRead_GemmK also bounded by size of threadwise copy SrcDataPerRead_GemmM = gcd(SrcDataPerRead_GemmM, a_data_per_thread_copy); + // decide threadwise copy lengths + const auto a_data_per_thread_copy_gemmm = SrcDataPerRead_GemmM; + const auto a_data_per_thread_copy_gemmk = + a_data_per_thread_copy / a_data_per_thread_copy_gemmm; + // GemmABlockCopyDstDataPerWrite_GemmM also bounded by size of threadwise copy - DstDataPerWrite_GemmM = gcd(DstDataPerWrite_GemmM, SrcDataPerRead_GemmM); + DstDataPerWrite_GemmM = gcd(DstDataPerWrite_GemmM, a_data_per_thread_copy_gemmm); + + // calculate blockwise copy thread cluster lengths + ClusterLengths_GemmK = GemmKPerBlock / a_data_per_thread_copy_gemmk; + ClusterLengths_GemmM = GemmMPerBlock / a_data_per_thread_copy_gemmm; + + if(!(ClusterLengths_GemmK > 0 && ClusterLengths_GemmM > 0)) + MIOPEN_THROW("invalid performance parameter"); } catch(...) { - return std::make_tuple(-1, -1, false); + return std::make_tuple(-1, -1, -1, -1, false); } - return std::make_tuple(SrcDataPerRead_GemmM, DstDataPerWrite_GemmM, true); + return std::make_tuple(ClusterLengths_GemmK, + ClusterLengths_GemmM, + SrcDataPerRead_GemmM, + DstDataPerWrite_GemmM, + true); } -std::tuple +std::tuple PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformanceParameters( const ConvolutionContext& ctx) const { + int ClusterLengths_GemmK = 0; + int ClusterLengths_GemmN = 0; int SrcDataPerRead_GemmN = amd_buffer_load_max_length(); int DstDataPerWrite_GemmN = amd_lds_write_max_length(); try { + const auto WaveSize = 64; + const auto BlockSize = + GemmNPerBlock * GemmMPerBlock / (GemmMPerWave * GemmNPerWave) * WaveSize; + SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, GemmNPerBlock); // calculate vector length on gemmn dimension @@ -116,9 +144,8 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformancePara // \todo too conversative if(y == 1 && x == 1) { - const auto ho = ConvolutionContextInterpreter::GetOutputHeightHo(ctx); - const auto wo = ConvolutionContextInterpreter::GetOutputWidthWo(ctx); - + const auto ho = ConvolutionContextInterpreter::GetOutputHeightHo(ctx); + const auto wo = ConvolutionContextInterpreter::GetOutputWidthWo(ctx); SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, ho * wo); } else @@ -127,7 +154,7 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformancePara } // calculate threadwise copy size - int b_data_per_thread_copy = GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN; + int b_data_per_thread_copy = (GemmKPerBlock * GemmNPerBlock) / BlockSize; if(!(b_data_per_thread_copy > 0)) MIOPEN_THROW("invalid performance parameter"); @@ -135,16 +162,85 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformancePara // GemmBBlockCopySrcDataPerRead_GemmN also bounded by size of threadwise copy SrcDataPerRead_GemmN = gcd(SrcDataPerRead_GemmN, b_data_per_thread_copy); + const auto b_data_per_thread_copy_gemmn = SrcDataPerRead_GemmN; + const auto b_data_per_thread_copy_gemmk = + b_data_per_thread_copy / b_data_per_thread_copy_gemmn; + // GemmBBlockCopyDstDataPerWrite_GemmN also bounded by size of threadwise copy - DstDataPerWrite_GemmN = gcd(DstDataPerWrite_GemmN, SrcDataPerRead_GemmN); + DstDataPerWrite_GemmN = gcd(DstDataPerWrite_GemmN, b_data_per_thread_copy_gemmn); + + // calculate blockwise copy thread cluster lengths + ClusterLengths_GemmK = GemmKPerBlock / b_data_per_thread_copy_gemmk; + ClusterLengths_GemmN = GemmNPerBlock / b_data_per_thread_copy_gemmn; + + if(!(ClusterLengths_GemmK > 0 && ClusterLengths_GemmN > 0)) + MIOPEN_THROW("invalid performance parameter"); } catch(...) { MIOPEN_LOG_I("catch"); - return std::make_tuple(-1, -1, false); + return std::make_tuple(-1, -1, -1, -1, false); } - return std::make_tuple(SrcDataPerRead_GemmN, DstDataPerWrite_GemmN, true); + return std::make_tuple(ClusterLengths_GemmK, + ClusterLengths_GemmN, + SrcDataPerRead_GemmN, + DstDataPerWrite_GemmN, + true); +} + +std::tuple PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateLdsNumberOfByte( + const ConvolutionContext& ctx) const +{ + std::size_t lds_size = 0; + + try + { + bool valid = false; + + int GemmABlockCopyClusterLengths_GemmM = 0; + int GemmABlockCopyDescDataPerWriteGemmM = 0; + std::tie(std::ignore, + GemmABlockCopyClusterLengths_GemmM, + std::ignore, + GemmABlockCopyDescDataPerWriteGemmM, + valid) = CalculateGemmABlockCopyPerformanceParameters(ctx); + + if(!valid) + MIOPEN_THROW("invalid performance parameter"); + + int GemmBBlockCopyClusterLengths_GemmN = 0; + int GemmBBlockCopyDescDataPerWriteGemmN = 0; + std::tie(std::ignore, + GemmBBlockCopyClusterLengths_GemmN, + std::ignore, + GemmBBlockCopyDescDataPerWriteGemmN, + valid) = CalculateGemmBBlockCopyPerformanceParameters(ctx); + + if(!valid) + MIOPEN_THROW("invalid performance parameter"); + + const auto ThreadGemmDataPerRead_GemmM = GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM; + const auto ThreadGemmDataPerRead_GemmN = GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN; + + const auto max_lds_align = lcm(GemmABlockCopyDescDataPerWriteGemmM, + GemmBBlockCopyDescDataPerWriteGemmN, + ThreadGemmDataPerRead_GemmM, + ThreadGemmDataPerRead_GemmN); + + const auto a_block_space = + GemmKPerBlock * integer_least_multiple(GemmMPerBlock, max_lds_align); + const auto b_block_space = + GemmKPerBlock * integer_least_multiple(GemmNPerBlock, max_lds_align); + + lds_size = 2 * (a_block_space + b_block_space) * sizeof(float); + } + catch(...) + { + return std::make_tuple(0, false); + } + + return std::make_tuple(lds_size, true); } bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext& ctx) const @@ -165,54 +261,45 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValid(const ConvolutionContext& GemmK % (GemmKPerBlock * GemmKBlocks) == 0)) return false; // wrong! cannot divice N evenly among thread } + // heuristic to reduce search space + { + // use largest XdlopsGemm + if(GemmMPerBlock >= 64 && GemmMPerWave != 64) + return false; + if(GemmNPerBlock >= 64 && GemmNPerWave != 64) + return false; + if((GemmMPerBlock == 32 || GemmMPerBlock == 16) && GemmMPerWave != GemmMPerBlock) + return false; + if((GemmNPerBlock == 32 || GemmNPerBlock == 16) && GemmNPerWave != GemmNPerBlock) + return false; + } - if(!(GemmKPerBlock % GemmBBlockCopyClusterLengths_GemmK == 0 && - GemmKPerBlock % GemmABlockCopyClusterLengths_GemmK == 0 && - GemmNPerBlock % GemmBBlockCopyClusterLengths_GemmN == 0 && - GemmMPerBlock % GemmABlockCopyClusterLengths_GemmM == 0)) - return false; + if(!(GemmM % GemmMPerBlock == 0 && GemmN % GemmNPerBlock == 0 && GemmK % GemmKPerBlock == 0)) + return false; // wrong! cannot divice N evenly among thread - // unsupported xdlops-gemm - if(GemmMPerWave == 16 && GemmNPerWave == 32) - return false; - if(GemmMPerWave == 32 && GemmNPerWave == 16) - return false; - if(GemmMPerWave == 8 && GemmNPerWave != 64) - return false; - if(GemmMPerWave == 4 && GemmNPerWave != 64) + if(!IsValidXdlopsGemm(GemmMPerBlock, GemmNPerBlock, GemmKPerBlock, GemmMPerWave, GemmNPerWave)) return false; - const auto WaveSize = 64; - const auto BlockSize = GemmNPerBlock * GemmMPerBlock / (GemmMPerWave * GemmNPerWave) * WaveSize; + bool valid = false; - // fail with blockSize >= 512 - /// \todo fix the issue with blockSize >= 512 - if(BlockSize < 64 || BlockSize > 256) - return false; + // check blockwise copy of A matrix + std::tie(std::ignore, std::ignore, std::ignore, std::ignore, valid) = + CalculateGemmABlockCopyPerformanceParameters(ctx); - if(BlockSize != GemmBBlockCopyClusterLengths_GemmK * GemmBBlockCopyClusterLengths_GemmN) + if(!valid) return false; - if(BlockSize != GemmABlockCopyClusterLengths_GemmM * GemmABlockCopyClusterLengths_GemmK) - return false; + // check blockwise copy of B matrix + std::tie(std::ignore, std::ignore, std::ignore, std::ignore, valid) = + CalculateGemmBBlockCopyPerformanceParameters(ctx); - if((GemmMPerBlock % GemmMPerWave) != 0 || (GemmNPerBlock % GemmNPerWave) != 0) + if(!valid) return false; - const auto GemmBBlockCopyThreadSliceLengths_GemmN = - GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN; - const auto GemmABlockCopyThreadSliceLengths_GemmM = - GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM; - const auto lds_size = ComputeLDSRequiredSize(ctx, - GemmNPerBlock, - GemmMPerBlock, - GemmKPerBlock, - 1, - 1, - GemmBBlockCopyThreadSliceLengths_GemmN, - GemmABlockCopyThreadSliceLengths_GemmM, - GetEPackLength(ctx, true)); - return lds_size <= 64 * 1024; + std::size_t lds_size = 0; + std::tie(lds_size, valid) = CalculateLdsNumberOfByte(ctx); + + return (valid and lds_size <= 64 * 1024); } PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlops(bool spare) @@ -224,12 +311,6 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlo GemmMPerWave = spare ? 4 : 64; GemmNPerWave = spare ? 16 : 64; - GemmBBlockCopyClusterLengths_GemmK = 4; - GemmBBlockCopyClusterLengths_GemmN = 4; - - GemmABlockCopyClusterLengths_GemmK = 2; - GemmABlockCopyClusterLengths_GemmM = 4; - use_spare_set = spare; } @@ -239,20 +320,12 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::PerformanceImplicitGemmBwdDataV4R1Xdlo int GemmKPerBlock_, int GemmMPerWave_, int GemmNPerWave_, - int GemmBBlockCopyClusterLengths_GemmK_, - int GemmBBlockCopyClusterLengths_GemmN_, - int GemmABlockCopyClusterLengths_GemmK_, - int GemmABlockCopyClusterLengths_GemmM_, bool use_spare_set_) : GemmNPerBlock(GemmNPerBlock_), GemmMPerBlock(GemmMPerBlock_), GemmKPerBlock(GemmKPerBlock_), GemmMPerWave(GemmMPerWave_), GemmNPerWave(GemmNPerWave_), - GemmBBlockCopyClusterLengths_GemmK(GemmBBlockCopyClusterLengths_GemmK_), - GemmBBlockCopyClusterLengths_GemmN(GemmBBlockCopyClusterLengths_GemmN_), - GemmABlockCopyClusterLengths_GemmK(GemmABlockCopyClusterLengths_GemmK_), - GemmABlockCopyClusterLengths_GemmM(GemmABlockCopyClusterLengths_GemmM_), use_spare_set(use_spare_set_) { } @@ -266,10 +339,6 @@ operator==(const PerformanceImplicitGemmBwdDataV4R1Xdlops& other) const && GemmKPerBlock == other.GemmKPerBlock && GemmMPerWave == other.GemmMPerWave && GemmNPerWave == other.GemmNPerWave - && GemmBBlockCopyClusterLengths_GemmK == other.GemmBBlockCopyClusterLengths_GemmK - && GemmBBlockCopyClusterLengths_GemmN == other.GemmBBlockCopyClusterLengths_GemmN - && GemmABlockCopyClusterLengths_GemmK == other.GemmABlockCopyClusterLengths_GemmK - && GemmABlockCopyClusterLengths_GemmM == other.GemmABlockCopyClusterLengths_GemmM && use_spare_set == other.use_spare_set; // clang-format on } @@ -281,11 +350,7 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::IsValidValue() const && IsTwoPower<4,128>(GemmMPerBlock) && IsTwoPower<4,32>(GemmKPerBlock) && IsTwoPower<4,64>(GemmMPerWave) - && IsTwoPower<16,64>(GemmNPerWave) - && IsTwoPower<4,16>(GemmBBlockCopyClusterLengths_GemmK) - && IsTwoPower<4,64>(GemmBBlockCopyClusterLengths_GemmN) - && IsTwoPower<2,16>(GemmABlockCopyClusterLengths_GemmK) - && IsTwoPower<4,128>(GemmABlockCopyClusterLengths_GemmM); // clang-format on + && IsTwoPower<16,64>(GemmNPerWave); // clang-format on } bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue() @@ -314,6 +379,7 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue() if(!NextTwoPower<16, 64>(GemmNPerWave)) break; } +#if 0 if(!NextTwoPower<4, 16>(GemmBBlockCopyClusterLengths_GemmK)) break; if(!NextTwoPower<4, 64>(GemmBBlockCopyClusterLengths_GemmN)) @@ -322,6 +388,7 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue() break; if(!NextTwoPower<4, 128>(GemmABlockCopyClusterLengths_GemmM)) break; +#endif return false; } while(false); @@ -331,25 +398,23 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue() void PerformanceImplicitGemmBwdDataV4R1Xdlops::EuristicInit(const ConvolutionContext& ctx) { PerformanceImplicitGemmBwdDataV4R1Xdlops tmp; - tmp = {128, 128, 8, 64, 64, 4, 64, 4, 64, use_spare_set}; + tmp = {128, 128, 8, 64, 64, use_spare_set}; if(!tmp.IsValid(ctx)) - tmp = {64, 32, 4, 32, 64, 4, 16, 2, 32, use_spare_set}; + tmp = {64, 32, 4, 32, 64, use_spare_set}; if(!tmp.IsValid(ctx)) - tmp = {64, 32, 4, 32, 64, 4, 16, 4, 16, use_spare_set}; + tmp = {32, 64, 4, 64, 32, use_spare_set}; if(!tmp.IsValid(ctx)) - tmp = {32, 64, 4, 64, 32, 4, 16, 4, 16, use_spare_set}; + tmp = {32, 32, 4, 32, 32, use_spare_set}; if(!tmp.IsValid(ctx)) - tmp = {32, 32, 4, 32, 32, 4, 16, 2, 32, use_spare_set}; + tmp = {64, 16, 4, 16, 64, use_spare_set}; if(!tmp.IsValid(ctx)) - tmp = {64, 16, 4, 16, 64, 4, 16, 4, 16, use_spare_set}; + tmp = {16, 64, 4, 64, 16, use_spare_set}; if(!tmp.IsValid(ctx)) - tmp = {16, 64, 4, 64, 16, 4, 16, 4, 16, use_spare_set}; + tmp = {16, 16, 4, 16, 16, use_spare_set}; if(!tmp.IsValid(ctx)) - tmp = {16, 16, 4, 16, 16, 4, 16, 4, 16, use_spare_set}; + tmp = {64, 4, 16, 4, 64, use_spare_set}; if(!tmp.IsValid(ctx)) - tmp = {64, 4, 16, 4, 64, 16, 4, 16, 4, use_spare_set}; - if(!tmp.IsValid(ctx)) - tmp = {64, 8, 8, 8, 64, 4, 16, 8, 8, use_spare_set}; + tmp = {64, 8, 8, 8, 64, use_spare_set}; if(!tmp.IsValid(ctx)) { MIOPEN_LOG_E("All attempts failed"); @@ -608,16 +673,24 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution( // TODO: add fp16 calculation by GetWorkspaceSize(ctx); result.workspce_sz = 0; - std::size_t GemmABlockCopySrcDataPerRead_GemmM = 1; - std::size_t GemmABlockCopyDstDataPerWrite_GemmM = 1; - std::size_t GemmBBlockCopySrcDataPerRead_GemmN = 1; - std::size_t GemmBBlockCopyDstDataPerWrite_GemmN = 1; - - std::tie(GemmABlockCopySrcDataPerRead_GemmM, + int GemmABlockCopySrcDataPerRead_GemmM = 1; + int GemmABlockCopyDstDataPerWrite_GemmM = 1; + int GemmBBlockCopySrcDataPerRead_GemmN = 1; + int GemmBBlockCopyDstDataPerWrite_GemmN = 1; + int GemmABlockCopyClusterLengths_GemmK = 0; + int GemmABlockCopyClusterLengths_GemmM = 0; + int GemmBBlockCopyClusterLengths_GemmK = 0; + int GemmBBlockCopyClusterLengths_GemmN = 0; + + std::tie(GemmABlockCopyClusterLengths_GemmK, + GemmABlockCopyClusterLengths_GemmM, + GemmABlockCopySrcDataPerRead_GemmM, GemmABlockCopyDstDataPerWrite_GemmM, std::ignore) = config.CalculateGemmABlockCopyPerformanceParameters(ctx); - std::tie(GemmBBlockCopySrcDataPerRead_GemmN, + std::tie(GemmBBlockCopyClusterLengths_GemmK, + GemmBBlockCopyClusterLengths_GemmN, + GemmBBlockCopySrcDataPerRead_GemmN, GemmBBlockCopyDstDataPerWrite_GemmN, std::ignore) = config.CalculateGemmBBlockCopyPerformanceParameters(ctx); @@ -648,12 +721,12 @@ ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution( std::string(" -DCK_PARAM_TUNABLE_GEMM_K_PER_BLOCK=") + std::to_string(GemmKPerBlock) + std::string(" -DCK_PARAM_GEMM_M_PER_WAVE=") + std::to_string(GemmMPerWave) + std::string(" -DCK_PARAM_GEMM_N_PER_WAVE=") + std::to_string(GemmNPerWave) + - std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.GemmABlockCopyClusterLengths_GemmK) + - std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M=") + std::to_string(config.GemmABlockCopyClusterLengths_GemmM) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(GemmABlockCopyClusterLengths_GemmK) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M=") + std::to_string(GemmABlockCopyClusterLengths_GemmM) + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_M=") + std::to_string(GemmABlockCopySrcDataPerRead_GemmM ) + std::string(" -DCK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_M=") + std::to_string(GemmABlockCopyDstDataPerWrite_GemmM) + - std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(config.GemmBBlockCopyClusterLengths_GemmK) + - std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N=") + std::to_string(config.GemmBBlockCopyClusterLengths_GemmN) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K=") + std::to_string(GemmBBlockCopyClusterLengths_GemmK) + + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N=") + std::to_string(GemmBBlockCopyClusterLengths_GemmN) + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N=") + std::to_string(GemmBBlockCopySrcDataPerRead_GemmN ) + std::string(" -DCK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N=") + std::to_string(GemmBBlockCopyDstDataPerWrite_GemmN) + std::string(" -DCK_PARAM_DEPENDENT_GRID_SIZE=") + std::to_string(grid_size) + From 515105047602cf8956e19f901a6d1ed5c456aa23 Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Fri, 8 May 2020 17:19:02 +0800 Subject: [PATCH 16/18] clang format for src/solver.cpp --- src/solver.cpp | 1 - 1 file changed, 1 deletion(-) mode change 100755 => 100644 src/solver.cpp diff --git a/src/solver.cpp b/src/solver.cpp old mode 100755 new mode 100644 index 45f050f4d1..ca0cf35916 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -315,7 +315,6 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) RegisterWithSolver( registry, ++id, ConvHipImplicitGemmBwdDataV4R1Xdlops{}, miopenConvolutionAlgoImplicitGEMM); - } } // namespace solver From d068e9affb68f22e3a58f0d3544115ce6dc8a566 Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Sat, 9 May 2020 10:19:18 +0800 Subject: [PATCH 17/18] remove non-used code --- src/include/miopen/solver.hpp | 12 ------------ .../conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 10 ---------- 2 files changed, 22 deletions(-) mode change 100644 => 100755 src/include/miopen/solver.hpp mode change 100644 => 100755 src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp old mode 100644 new mode 100755 index d17cfae274..40bae8c223 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -677,13 +677,7 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops int GemmMPerWave; int GemmNPerWave; -#if 0 - int GemmBBlockCopyClusterLengths_GemmK; // 2^n[4..16] - int GemmBBlockCopyClusterLengths_GemmN; // 2^n[8..64] - int GemmABlockCopyClusterLengths_GemmK; // 2^n[1..4] - int GemmABlockCopyClusterLengths_GemmM; // 2^n[16..128] -#endif bool use_spare_set; PerformanceImplicitGemmBwdDataV4R1Xdlops(int, int, int, int, int, bool); @@ -710,12 +704,6 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops f(self.GemmKPerBlock, "GemmKPerBlock"); f(self.GemmMPerWave, "GemmMPerWave"); f(self.GemmNPerWave, "GemmNPerWave"); -#if 0 - f(self.GemmBBlockCopyClusterLengths_GemmK, "GemmBBlockCopyClusterLengths_GemmK"); - f(self.GemmBBlockCopyClusterLengths_GemmN, "GemmBBlockCopyClusterLengths_GemmN"); - f(self.GemmABlockCopyClusterLengths_GemmK, "GemmABlockCopyClusterLengths_GemmK"); - f(self.GemmABlockCopyClusterLengths_GemmM, "GemmABlockCopyClusterLengths_GemmM"); -#endif } std::tuple CalculateGridSize(const ConvolutionContext& ctx) const; diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp old mode 100644 new mode 100755 index 0273e418b5..439a634775 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -379,16 +379,6 @@ bool PerformanceImplicitGemmBwdDataV4R1Xdlops::SetNextValue() if(!NextTwoPower<16, 64>(GemmNPerWave)) break; } -#if 0 - if(!NextTwoPower<4, 16>(GemmBBlockCopyClusterLengths_GemmK)) - break; - if(!NextTwoPower<4, 64>(GemmBBlockCopyClusterLengths_GemmN)) - break; - if(!NextTwoPower<2, 16>(GemmABlockCopyClusterLengths_GemmK)) - break; - if(!NextTwoPower<4, 128>(GemmABlockCopyClusterLengths_GemmM)) - break; -#endif return false; } while(false); From abf7fd5b770865500512a05e6f39d2055d4e2c50 Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 13 May 2020 11:40:28 +0000 Subject: [PATCH 18/18] remove workaround issues for v4r1 xdlops kernel --- src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp index 439a634775..08855f1780 100755 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp @@ -505,10 +505,6 @@ ConvHipImplicitGemmBwdDataV4R1Xdlops::CalculateGemmSize(const ConvolutionContext bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsApplicable(const ConvolutionContext& ctx) const { -#if WORKAROUND_SWDEV_229277_227616_229195 - if(!IsHccCompiler()) - return false; -#endif bool is_applicable = true; if(!ctx.direction.IsBackwardData())