diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index de3427c33d3..ff1367a9c1b 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -28,6 +28,60 @@ using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*; namespace ck_tile { +// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the +// memory to the SGPR registers. +__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v) +{ + return __builtin_amdgcn_readfirstlane(static_cast(v)); +} + +__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v) +{ + return __builtin_amdgcn_readfirstlane(static_cast(v)); +} + +__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value) +{ + return __builtin_amdgcn_readfirstlane(value); +} + +__device__ inline int32_t amd_wave_read_first_lane(int32_t value) +{ + return __builtin_amdgcn_readfirstlane(value); +} + +template , int> = 0> +__device__ inline auto amd_wave_read_first_lane(const Object& obj) +{ + constexpr size_t ObjectSize = sizeof(Object); + constexpr size_t SGPR_size = 4; + constexpr size_t NumFull = ObjectSize / SGPR_size; + constexpr size_t Tail = ObjectSize % SGPR_size; + + const unsigned char* src = reinterpret_cast(&obj); + alignas(Object) unsigned char dst[ObjectSize]; + + static_for<0, NumFull, 1>{}([&](auto Ic) { + constexpr size_t offset = Ic * SGPR_size; + uint32_t read_src; + __builtin_memcpy(&read_src, src + offset, SGPR_size); + read_src = __builtin_amdgcn_readfirstlane(read_src); + __builtin_memcpy(dst + offset, &read_src, SGPR_size); + }); + + if constexpr(Tail != 0) + { + constexpr size_t offset = NumFull * SGPR_size; + uint32_t tail_loc = 0; + __builtin_memcpy(&tail_loc, src + offset, Tail); + tail_loc = __builtin_amdgcn_readfirstlane(tail_loc); + __builtin_memcpy(dst + offset, &tail_loc, Tail); + } + Object out; + __builtin_memcpy(&out, dst, ObjectSize); + return out; +} + // 128 bit SGPRs to supply buffer resource in buffer instructions // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions struct __attribute__((packed)) buffer_resource @@ -37,10 +91,17 @@ struct __attribute__((packed)) buffer_resource uint32_t config; }; -CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t size = 0xffffffff) +template +CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, + uint32_t size = 0xffffffff, + ForceSGPR = {}) { buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD}; int32x4_t r = __builtin_bit_cast(int32x4_t, res); + if constexpr(std::is_same_v) + { + r = amd_wave_read_first_lane(r); + } return r; } @@ -2829,60 +2890,6 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr) } #endif -// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the -// memory to the SGPR registers. -__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v) -{ - return __builtin_amdgcn_readfirstlane(static_cast(v)); -} - -__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v) -{ - return __builtin_amdgcn_readfirstlane(static_cast(v)); -} - -__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value) -{ - return __builtin_amdgcn_readfirstlane(value); -} - -__device__ inline int32_t amd_wave_read_first_lane(int32_t value) -{ - return __builtin_amdgcn_readfirstlane(value); -} - -template , int> = 0> -__device__ inline auto amd_wave_read_first_lane(const Object& obj) -{ - constexpr size_t ObjectSize = sizeof(Object); - constexpr size_t SGPR_size = 4; - constexpr size_t NumFull = ObjectSize / SGPR_size; - constexpr size_t Tail = ObjectSize % SGPR_size; - - const unsigned char* src = reinterpret_cast(&obj); - alignas(Object) unsigned char dst[ObjectSize]; - - static_for<0, NumFull, 1>{}([&](auto Ic) { - constexpr size_t offset = Ic * SGPR_size; - uint32_t read_src; - __builtin_memcpy(&read_src, src + offset, SGPR_size); - read_src = __builtin_amdgcn_readfirstlane(read_src); - __builtin_memcpy(dst + offset, &read_src, SGPR_size); - }); - - if constexpr(Tail != 0) - { - constexpr size_t offset = NumFull * SGPR_size; - uint32_t tail_loc = 0; - __builtin_memcpy(&tail_loc, src + offset, Tail); - tail_loc = __builtin_amdgcn_readfirstlane(tail_loc); - __builtin_memcpy(dst + offset, &tail_loc, Tail); - } - Object out; - __builtin_memcpy(&out, dst, ObjectSize); - return out; -} - } // namespace ck_tile #endif // !CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index ce5a8075df4..38e033cd92d 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -19,6 +19,60 @@ using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*; namespace ck_tile { +// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the +// memory to the SGPR registers. +__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v) +{ + return __builtin_amdgcn_readfirstlane(static_cast(v)); +} + +__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v) +{ + return __builtin_amdgcn_readfirstlane(static_cast(v)); +} + +__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value) +{ + return __builtin_amdgcn_readfirstlane(value); +} + +__device__ inline int32_t amd_wave_read_first_lane(int32_t value) +{ + return __builtin_amdgcn_readfirstlane(value); +} + +template , int> = 0> +__device__ inline auto amd_wave_read_first_lane(const Object& obj) +{ + constexpr size_t ObjectSize = sizeof(Object); + constexpr size_t SGPR_size = 4; + constexpr size_t NumFull = ObjectSize / SGPR_size; + constexpr size_t Tail = ObjectSize % SGPR_size; + + const unsigned char* src = reinterpret_cast(&obj); + alignas(Object) unsigned char dst[ObjectSize]; + + static_for<0, NumFull, 1>{}([&](auto Ic) { + constexpr size_t offset = Ic * SGPR_size; + uint32_t read_src; + __builtin_memcpy(&read_src, src + offset, SGPR_size); + read_src = __builtin_amdgcn_readfirstlane(read_src); + __builtin_memcpy(dst + offset, &read_src, SGPR_size); + }); + + if constexpr(Tail != 0) + { + constexpr size_t offset = NumFull * SGPR_size; + uint32_t tail_loc = 0; + __builtin_memcpy(&tail_loc, src + offset, Tail); + tail_loc = __builtin_amdgcn_readfirstlane(tail_loc); + __builtin_memcpy(dst + offset, &tail_loc, Tail); + } + Object out; + __builtin_memcpy(&out, dst, ObjectSize); + return out; +} + // 128 bit SGPRs to supply buffer resource in buffer instructions // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions struct __attribute__((packed)) buffer_resource @@ -28,10 +82,17 @@ struct __attribute__((packed)) buffer_resource uint32_t config; }; -CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t size = 0xffffffff) +template +CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, + uint32_t size = 0xffffffff, + ForceSGPR = {}) { buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD}; int32x4_t r = __builtin_bit_cast(int32x4_t, res); + if constexpr(std::is_same_v) + { + r = amd_wave_read_first_lane(r); + } return r; } @@ -2570,60 +2631,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer& src_thread_ #endif } -// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the -// memory to the SGPR registers. -__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v) -{ - return __builtin_amdgcn_readfirstlane(static_cast(v)); -} - -__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v) -{ - return __builtin_amdgcn_readfirstlane(static_cast(v)); -} - -__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value) -{ - return __builtin_amdgcn_readfirstlane(value); -} - -__device__ inline int32_t amd_wave_read_first_lane(int32_t value) -{ - return __builtin_amdgcn_readfirstlane(value); -} - -template , int> = 0> -__device__ inline auto amd_wave_read_first_lane(const Object& obj) -{ - constexpr size_t ObjectSize = sizeof(Object); - constexpr size_t SGPR_size = 4; - constexpr size_t NumFull = ObjectSize / SGPR_size; - constexpr size_t Tail = ObjectSize % SGPR_size; - - const unsigned char* src = reinterpret_cast(&obj); - alignas(Object) unsigned char dst[ObjectSize]; - - static_for<0, NumFull, 1>{}([&](auto Ic) { - constexpr size_t offset = Ic * SGPR_size; - uint32_t read_src; - __builtin_memcpy(&read_src, src + offset, SGPR_size); - read_src = __builtin_amdgcn_readfirstlane(read_src); - __builtin_memcpy(dst + offset, &read_src, SGPR_size); - }); - - if constexpr(Tail != 0) - { - constexpr size_t offset = NumFull * SGPR_size; - uint32_t tail_loc = 0; - __builtin_memcpy(&tail_loc, src + offset, Tail); - tail_loc = __builtin_amdgcn_readfirstlane(tail_loc); - __builtin_memcpy(dst + offset, &tail_loc, Tail); - } - Object out; - __builtin_memcpy(&out, dst, ObjectSize); - return out; -} - template CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, const index_t global_offset, diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp index d19f0894b9e..3f0dbfb340d 100644 --- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp @@ -210,7 +210,8 @@ struct FusedMoeGemmPipeline_FlatmmUk auto a_res = make_wave_buffer_resource(reinterpret_cast(kargs.a_ptr), - kargs.num_tokens * kargs.stride_token * sizeof(ADataType)); + kargs.num_tokens * kargs.stride_token * sizeof(ADataType), + std::true_type{}); auto make_gu_win = [&](const auto* ptr_) { auto view_ = make_naive_tensor_view( @@ -322,7 +323,8 @@ struct FusedMoeGemmPipeline_FlatmmUk auto o_res = make_wave_buffer_resource(reinterpret_cast(kargs.o_ptr), - kargs.num_tokens * kargs.stride_token * sizeof(ODataType)); + kargs.num_tokens * kargs.stride_token * sizeof(ODataType), + std::true_type{}); auto row_coords_o = GetRowCoords_O(sorted_tile_id * BlockShape::Block_M0); auto w_scale = GetWeightScale( row_coords_o, reinterpret_cast(kargs.sorted_weight_ptr));