From 55dd2c844c3a3075cddf6abfddb698e73a27d569 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 13 Oct 2021 10:37:24 -0500 Subject: [PATCH 01/16] start fixing 16bit data packing --- .../threadwise_tensor_slice_transfer.hpp | 582 ++++++++++++++++++ composable_kernel/include/utility/config.hpp | 2 +- .../src/gemm_driver_offline.cpp | 6 +- 3 files changed, 586 insertions(+), 4 deletions(-) diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index 157828bf0fc..1f3dc8fe7ea 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -670,6 +670,587 @@ struct ThreadwiseTensorSliceTransfer_v2 SrcCoord src_coord_; }; // namespace ck +#if 0 +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. Use thread buffer +template // control whether to move back dst coordinate after each + // RunWrite(), will be fused with MoveDstSliceWindow to + // save addr computation +struct ThreadwiseTensorSliceTransfer_v3 +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v3(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) + { + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void + RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! 
SrcBuffer and SrcData data type are inconsistent"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, forward_step_idx, src_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, backward_step_idx, src_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + vector_type_maker_t src_tmp_vector; + + using src_vector_t = typename decltype(src_tmp_vector)::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf to src_tmp_vector + src_tmp_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + + // copy data from src_tmp_vector to buffer_ + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector); + + buffer_(Number{}) = src_tmp_vector.template AsType()[i]; + }); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + } + + template + __device__ void + RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) + { + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // src scalar per access on each dim + // TODO: don't use this + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, forward_step_idx, dst_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? 
-dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, backward_step_idx, dst_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + vector_type_maker_t dst_tmp_vector; + + // copy data from buffer_ to dst_tmp_vector + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector); + + dst_tmp_vector.template AsType()(i) = + type_convert{}(buffer_[Number{}]); + }); + + using dst_vector_t = typename decltype(dst_tmp_vector)::type; + + // copy data from dst_tmp_vector to dst_buf + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_tmp_vector.template AsType()[Number<0>{}]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) + { + constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto src_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunRead(src_desc, src_buf, src_step_hacks); + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto dst_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { 
return zeros; }, Number{})); + + RunWrite(dst_desc, dst_buf, dst_step_hacks); + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx, + const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step( + src_desc, adjusted_step_idx, src_move_slice_window_step_hack); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + static constexpr auto buffer_desc_ = + make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + + static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); + + StaticBuffer buffer_; + + SrcCoord src_coord_; + DstCoord dst_coord_; +}; +#else // Assume: // 1. src_desc and dst_desc are not known at compile-time // 2. SrcBuffer and DstBuffer are DynamicBuffer @@ -1249,6 +1830,7 @@ struct ThreadwiseTensorSliceTransfer_v3 SrcCoord src_coord_; DstCoord dst_coord_; }; +#endif // Assume: // 1. 
src: diff --git a/composable_kernel/include/utility/config.hpp b/composable_kernel/include/utility/config.hpp index 5ee4bb9c642..62f92d1d5a4 100644 --- a/composable_kernel/include/utility/config.hpp +++ b/composable_kernel/include/utility/config.hpp @@ -94,7 +94,7 @@ #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 // merge transformation use magic number division -#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0 +#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1 // hack: have underlying assumption that need to be satsified, otherwise it's a bug // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be diff --git a/host/driver_offline/src/gemm_driver_offline.cpp b/host/driver_offline/src/gemm_driver_offline.cpp index e60b4905ae7..fec8170422c 100644 --- a/host/driver_offline/src/gemm_driver_offline.cpp +++ b/host/driver_offline/src/gemm_driver_offline.cpp @@ -22,10 +22,10 @@ #include "device_gemm_xdlops_km_kn_nm.hpp" #include "device_gemm_xdlops_km_nk_nm.hpp" -#define USE_GEMM_XDL_MK_KN_MN 1 -#define USE_GEMM_XDL_MK_NK_MN 1 +#define USE_GEMM_XDL_MK_KN_MN 0 +#define USE_GEMM_XDL_MK_NK_MN 0 #define USE_GEMM_XDL_KM_KN_MN 1 -#define USE_GEMM_XDL_KM_NK_MN 1 +#define USE_GEMM_XDL_KM_NK_MN 0 #define USE_GEMM_XDL_MK_KN_NM 0 #define USE_GEMM_XDL_MK_NK_NM 0 #define USE_GEMM_XDL_KM_KN_NM 0 From f513f88740f547f07abc70dcb52b1a71ac880e34 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 16 Oct 2021 12:56:56 -0500 Subject: [PATCH 02/16] adding StaticTensor --- .../is_known_at_compile_time.hpp | 49 ++ .../tensor_description/static_tensor.hpp | 105 +++ .../blockwise_tensor_slice_transfer.hpp | 35 +- .../threadwise_tensor_slice_transfer.hpp | 582 ----------------- .../threadwise_tensor_slice_transfer_v3r2.hpp | 607 ++++++++++++++++++ .../include/utility/common_header.hpp | 1 + .../include/utility/container_helper.hpp | 13 - composable_kernel/include/utility/ignore.hpp | 21 + .../include/utility/tuple_helper.hpp | 16 - composable_kernel/include/utility/type.hpp | 15 - 10 files changed, 801 insertions(+), 643 deletions(-) create mode 100644 composable_kernel/include/tensor_description/is_known_at_compile_time.hpp create mode 100644 composable_kernel/include/tensor_description/static_tensor.hpp create mode 100644 composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp create mode 100644 composable_kernel/include/utility/ignore.hpp diff --git a/composable_kernel/include/tensor_description/is_known_at_compile_time.hpp b/composable_kernel/include/tensor_description/is_known_at_compile_time.hpp new file mode 100644 index 00000000000..9dbe22f2eea --- /dev/null +++ b/composable_kernel/include/tensor_description/is_known_at_compile_time.hpp @@ -0,0 +1,49 @@ +#ifndef IS_KNOWN_AT_COMPILE_TIME_HPP +#define IS_KNOWN_AT_COMPILE_TIME_HPP + +#include "config.hpp" +#include "integral_constant.hpp" +#include "sequence.hpp" +#include "tuple.hpp" + +namespace ck { + +template +struct is_known_at_compile_time; + +template <> +struct is_known_at_compile_time +{ + static constexpr bool value = false; +}; + +template +struct is_known_at_compile_time> +{ + static constexpr bool value = true; +}; + +template +struct is_known_at_compile_time> +{ + static constexpr bool value = true; +}; + +template +struct is_known_at_compile_time> +{ + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return container_reduce( + Tuple{}, + [](auto x, bool r) { + return is_known_at_compile_time>::value & r; + }, + true); + } + + static constexpr bool 
value = IsKnownAtCompileTime(); +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/static_tensor.hpp b/composable_kernel/include/tensor_description/static_tensor.hpp new file mode 100644 index 00000000000..bdf98b7b702 --- /dev/null +++ b/composable_kernel/include/tensor_description/static_tensor.hpp @@ -0,0 +1,105 @@ +#ifndef CK_STATIC_TENSOR_HPP +#define CK_STATIC_TENSOR_HPP + +#include "ignore.hpp" +#include "static_buffer.hpp" + +namespace ck { + +template ::type = false> +struct StaticTensor +{ + static constexpr index_t NDim = TensorDesc::GetNumOfDimension(); + static constexpr index_t buffer_size_ = TensorDesc{}.GetElementSpaceSize(); + + static constexpr auto desc_ = TensorDesc{}; + + using Buffer = StaticBuffer; + + __host__ __device__ constexpr StaticTensor() : invalid_element_value_{0} {} + + __host__ __device__ constexpr StaticTensor(T invalid_element_value) + : invalid_element_value_{invalid_element_value} + { + } + + template ::value && Idx::Size() == NDim, + bool>::type = false> + __host__ __device__ constexpr const T& operator[](Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return buffer_[Number{}]; + } + else + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + return T{0}; + } + else + { + return invalid_element_value_; + } + } + } + + template ::value && Idx::Size() == NDim, + bool>::type = false> + __host__ __device__ T& operator()(Idx) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return buffer_(Number{}); + } + else + { + return ignore; + } + } + + Buffer buffer_; + T invalid_element_value_ = T{0}; +}; + +template ::type = false> +__host__ __device__ constexpr auto make_static_tensor(TensorDesc) +{ + return StaticTensor{}; +} + +template < + AddressSpaceEnum_t AddressSpace, + typename T, + typename TensorDesc, + typename X, + typename enable_if::type = false, + typename enable_if, remove_cvref_t>::value, bool>::type = false> +__host__ __device__ constexpr auto make_static_tensor(TensorDesc, X invalid_element_value) +{ + return StaticTensor{invalid_element_value}; +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp index 0214b713522..e0330853a74 100644 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp @@ -5,7 +5,8 @@ #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" #include "cluster_descriptor.hpp" -#include "threadwise_tensor_slice_transfer.hpp" +//#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer_v3r2.hpp" namespace ck { @@ -146,22 +147,22 @@ struct BlockwiseTensorSliceTransfer_v4 make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); using ThreadwiseTransfer = - ThreadwiseTensorSliceTransfer_v3; + ThreadwiseTensorSliceTransfer_v3r2; ThreadwiseTransfer threadwise_transfer_; }; diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp 
b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index 1f3dc8fe7ea..157828bf0fc 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -670,587 +670,6 @@ struct ThreadwiseTensorSliceTransfer_v2 SrcCoord src_coord_; }; // namespace ck -#if 0 -// Assume: -// 1. src_desc and dst_desc are not known at compile-time -// 2. SrcBuffer and DstBuffer are DynamicBuffer -// 3. src_slice_origin and dst_slice_origin are not known at compile-time, -// 4. Use thread buffer -template // control whether to move back dst coordinate after each - // RunWrite(), will be fused with MoveDstSliceWindow to - // save addr computation -struct ThreadwiseTensorSliceTransfer_v3 -{ - static constexpr index_t nDim = SliceLengths::Size(); - using Index = MultiIndex; - - using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - - using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); - using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); - - __device__ constexpr ThreadwiseTensorSliceTransfer_v3(const SrcDesc& src_desc, - const Index& src_slice_origin, - const DstDesc& dst_desc, - const Index& dst_slice_origin) - : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), - dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) - { - } - - __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) - { - src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); - } - - __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) - { - dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); - } - - template - __device__ void - RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) - { - static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, - "wrong!"); - - static_assert( - is_same, remove_cvref_t>::value, - "wrong! SrcBuffer and SrcData data type are inconsistent"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto src_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - - constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - - constexpr auto src_dim_access_order = SrcDimAccessOrder{}; - - constexpr auto ordered_src_access_lengths = - container_reorder_given_new2old(src_access_lengths, src_dim_access_order); - - // make forward steps - const auto src_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, forward_step_idx, src_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto src_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? 
-src_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, backward_step_idx, src_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_src_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_src_access_idx[I0]; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i] - : ordered_src_access_lengths[i] - 1 - - ordered_src_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * - src_scalar_per_access; - }(); - - vector_type_maker_t src_tmp_vector; - - using src_vector_t = typename decltype(src_tmp_vector)::type; - - const bool is_src_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); - - // copy data from src_buf to src_tmp_vector - src_tmp_vector.template AsType()(Number<0>{}) = - src_buf.template Get(src_coord_.GetOffset(), is_src_valid); - - // copy data from src_tmp_vector to buffer_ - static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { - constexpr index_t buffer_offset = - buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector); - - buffer_(Number{}) = src_tmp_vector.template AsType()[i]; - }); - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= - ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); - } - } - }); - }); - - // move src coordinate back to slice origin (or not) - if constexpr(SrcResetCoordinateAfterRun) - { - const auto src_reset_step = - make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); - - move_tensor_coordinate(src_desc, src_coord_, src_reset_step); - } - } - - template - __device__ void - RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) - { - static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, - "wrong!"); - - static_assert( - is_same, remove_cvref_t>::value, - "wrong! 
SrcBuffer or DstBuffer data type is wrong"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - // src scalar per access on each dim - // TODO: don't use this - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto dst_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - - constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dst_dim_access_order = DstDimAccessOrder{}; - - constexpr auto ordered_dst_access_lengths = - container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); - - // make forward steps - const auto dst_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, forward_step_idx, dst_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto dst_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, backward_step_idx, dst_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_dst_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_dst_access_idx[I0]; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_idx[i] - : ordered_dst_access_lengths[i] - 1 - - ordered_dst_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * - dst_scalar_per_access; - }(); - - vector_type_maker_t dst_tmp_vector; - - // copy data from buffer_ to dst_tmp_vector - static_for<0, DstScalarPerVector, 1>{}([&](auto i) { - constexpr index_t buffer_offset = - buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector); - - dst_tmp_vector.template AsType()(i) = - type_convert{}(buffer_[Number{}]); - }); - - using dst_vector_t = typename decltype(dst_tmp_vector)::type; - - // copy data from dst_tmp_vector to dst_buf - const bool is_dst_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); - - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_tmp_vector.template AsType()[Number<0>{}]); - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= - ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); - } - } - }); - }); - - // move dst coordinate back to slice origin (or not) - if constexpr(DstResetCoordinateAfterRun) - { - const auto dst_reset_step = - make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); - - move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); - } - } - - template - __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) - { - constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto src_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - RunRead(src_desc, src_buf, src_step_hacks); - } - - template - __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) - { - constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto dst_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - RunWrite(dst_desc, dst_buf, dst_step_hacks); - } - - __device__ static constexpr auto GetSrcCoordinateResetStep() - { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - - constexpr auto src_dim_access_order = SrcDimAccessOrder{}; - - constexpr auto ordered_src_access_lengths = - container_reorder_given_new2old(src_access_lengths, src_dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, 
nDim, 1>{}([&](auto i) { - index_t tmp = ordered_src_access_lengths[I0] - 1; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index after last iteration in RunRead(), if it has not being reset by - // RunRead() - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * - src_scalar_per_access; - }(); - - // - constexpr auto reset_src_data_step = [&]() { - Index reset_src_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); - - return reset_src_data_step_; - }(); - - return reset_src_data_step; - } - - __device__ static constexpr auto GetDstCoordinateResetStep() - { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dst_dim_access_order = DstDimAccessOrder{}; - - constexpr auto ordered_dst_access_lengths = - container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_dst_access_lengths[I0] - 1; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index after last iteration in RunWrite(), if it has not being reset by - // RunWrite() - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * - dst_scalar_per_access; - }(); - - // - constexpr auto reset_dst_data_step = [&]() { - Index reset_dst_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); - - return reset_dst_data_step_; - }(); - - return reset_dst_data_step; - } - - // src_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& src_slice_origin_step_idx) - { - // if src coord was not reset by RunRead(), then need to adjust the step here - const auto adjusted_step_idx = - SrcResetCoordinateAfterRun ? src_slice_origin_step_idx - : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); - - // is it OK to construct a new step every time? 
- const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); - - move_tensor_coordinate(src_desc, src_coord_, adjusted_step); - } - - // src_slice_origin_step_idx need to be known at compile-time, for performance reason - template - __device__ void - MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& src_slice_origin_step_idx, - const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) - { - // if src coord was not reset by RunRead(), then need to adjust the step here - const auto adjusted_step_idx = - SrcResetCoordinateAfterRun ? src_slice_origin_step_idx - : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step( - src_desc, adjusted_step_idx, src_move_slice_window_step_hack); - - move_tensor_coordinate(src_desc, src_coord_, adjusted_step); - } - // dst_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, - const Index& dst_slice_origin_step_idx) - { - // if dst coord was not reset by RunWrite(), then need to adjust the step here - const auto adjusted_step_idx = - DstResetCoordinateAfterRun ? dst_slice_origin_step_idx - : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); - - move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); - } - - private: - static constexpr auto buffer_desc_ = - make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); - - static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); - - StaticBuffer buffer_; - - SrcCoord src_coord_; - DstCoord dst_coord_; -}; -#else // Assume: // 1. src_desc and dst_desc are not known at compile-time // 2. SrcBuffer and DstBuffer are DynamicBuffer @@ -1830,7 +1249,6 @@ struct ThreadwiseTensorSliceTransfer_v3 SrcCoord src_coord_; DstCoord dst_coord_; }; -#endif // Assume: // 1. src: diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp new file mode 100644 index 00000000000..f46d99c1102 --- /dev/null +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -0,0 +1,607 @@ +#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R2_HPP +#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R2_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "static_tensor.hpp" + +namespace ck { + +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. 
Use thread buffer +template // control whether to move back dst coordinate after each + // RunWrite(), will be fused with MoveDstSliceWindow to + // save addr computation +struct ThreadwiseTensorSliceTransfer_v3r2 +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v3r2(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) + { + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void + RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer and SrcData data type are inconsistent"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, forward_step_idx, src_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? 
-src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, backward_step_idx, src_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + +#if 1 // debug + constexpr auto src_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); +#endif + + vector_type_maker_t src_tmp_vector; + + using src_vector_t = typename decltype(src_tmp_vector)::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf to src_tmp_vector + src_tmp_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + + // copy data from src_tmp_vector to thread_tensor_ + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { +#if 0 // debug + thread_tensor_(src_data_idx + i * src_scalar_step_in_vector) = + src_tmp_vector.template AsType()[i]; +#else + thread_tensor_(src_data_idx_seq + i * src_scalar_step_in_vector) = + src_tmp_vector.template AsType()[i]; +#endif + }); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + } + + template + __device__ void + RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) + { + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! 
SrcBuffer or DstBuffer data type is wrong"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // src scalar per access on each dim + // TODO: don't use this + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, forward_step_idx, dst_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, backward_step_idx, dst_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + +#if 1 // debug + constexpr auto dst_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); +#endif + + vector_type_maker_t dst_tmp_vector; + + // copy data from thread_tensor_ to dst_tmp_vector + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { +#if 0 // debug + dst_tmp_vector.template AsType()(i) = type_convert{}( + thread_tensor_[dst_data_idx + i * dst_scalar_step_in_vector]); +#else + dst_tmp_vector.template AsType()(i) = type_convert{}( + thread_tensor_[dst_data_idx_seq + i * dst_scalar_step_in_vector]); +#endif + }); + + using dst_vector_t = typename decltype(dst_tmp_vector)::type; + + // copy data from dst_tmp_vector to dst_buf + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_tmp_vector.template AsType()[Number<0>{}]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) + { + constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto src_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunRead(src_desc, src_buf, src_step_hacks); + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto dst_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunWrite(dst_desc, dst_buf, dst_step_hacks); + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, 
src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? 
+ const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx, + const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step( + src_desc, adjusted_step_idx, src_move_slice_window_step_hack); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + static constexpr auto thread_tensor_desc_ = + make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + + StaticTensor + thread_tensor_; + + SrcCoord src_coord_; + DstCoord dst_coord_; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 85c02a1b99d..2947ec9c43c 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -31,6 +31,7 @@ #include "amd_buffer_addressing.hpp" #include "static_buffer.hpp" #include "dynamic_buffer.hpp" +#include "is_known_at_compile_time.hpp" #include "inner_product.hpp" diff --git a/composable_kernel/include/utility/container_helper.hpp b/composable_kernel/include/utility/container_helper.hpp index a7ed8ec059e..a92e79908d9 100644 --- a/composable_kernel/include/utility/container_helper.hpp +++ b/composable_kernel/include/utility/container_helper.hpp @@ -373,19 +373,6 @@ set_container_subset(Tuple& y, Sequence picks, const Tuple& static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; }); } -template -__host__ __device__ constexpr auto to_tuple_of_number(const Container&) -{ - static_assert(is_known_at_compile_time::value, "wrong!"); - - return generate_tuple( - [&](auto i) { - constexpr index_t tmp = Container::At(i); - return Number{}; - }, - Container::Size()); -} - template __host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence) { diff --git a/composable_kernel/include/utility/ignore.hpp b/composable_kernel/include/utility/ignore.hpp new file mode 100644 index 00000000000..8a199159b3e --- /dev/null +++ b/composable_kernel/include/utility/ignore.hpp @@ -0,0 +1,21 @@ +#ifndef CK_IGNORE_HPP +#define CK_IGNORE_HPP + +// https://en.cppreference.com/w/cpp/utility/tuple/ignore + +namespace ck { + +namespace detail { +struct ignore_t +{ + template + constexpr void operator=(T&&) const noexcept + { + } +}; +} // 
namespace detail + +inline constexpr detail::ignore_t ignore; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/tuple_helper.hpp b/composable_kernel/include/utility/tuple_helper.hpp index 55a79d2594e..f568a9d824e 100644 --- a/composable_kernel/include/utility/tuple_helper.hpp +++ b/composable_kernel/include/utility/tuple_helper.hpp @@ -6,22 +6,6 @@ namespace ck { -template -struct is_known_at_compile_time> -{ - __host__ __device__ static constexpr bool IsKnownAtCompileTime() - { - return container_reduce( - Tuple{}, - [](auto x, bool r) { - return is_known_at_compile_time>::value & r; - }, - true); - } - - static constexpr bool value = IsKnownAtCompileTime(); -}; - template __host__ __device__ constexpr auto generate_tuple(F&& f, Number) { diff --git a/composable_kernel/include/utility/type.hpp b/composable_kernel/include/utility/type.hpp index 89a2bdbde63..8e2c2016733 100644 --- a/composable_kernel/include/utility/type.hpp +++ b/composable_kernel/include/utility/type.hpp @@ -28,21 +28,6 @@ using remove_cvref_t = remove_cv_t>; template inline constexpr bool is_pointer_v = std::is_pointer::value; -template -struct is_known_at_compile_time; - -template <> -struct is_known_at_compile_time -{ - static constexpr bool value = false; -}; - -template -struct is_known_at_compile_time> -{ - static constexpr bool value = true; -}; - template ::type = false> __host__ __device__ constexpr Y as_type(X x) { From 02a1b50cb9d694b0aff3690615b1263857530887 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 16 Oct 2021 22:10:54 -0500 Subject: [PATCH 03/16] adding StaticTensor --- .../tensor_description/static_tensor.hpp | 103 +++++++++++-- .../tensor_description/tensor_adaptor.hpp | 14 ++ .../blockwise_gemm_xdlops.hpp | 5 +- .../threadwise_tensor_slice_transfer_v3r2.hpp | 11 +- .../include/utility/common_header.hpp | 2 + .../include/utility/static_buffer.hpp | 145 +++++------------- .../static_buffer_of_vector_type_v2.hpp | 100 ++++++++++++ composable_kernel/include/utility/tuple.hpp | 4 + ...icit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp | 4 +- 9 files changed, 266 insertions(+), 122 deletions(-) create mode 100644 composable_kernel/include/utility/static_buffer_of_vector_type_v2.hpp diff --git a/composable_kernel/include/tensor_description/static_tensor.hpp b/composable_kernel/include/tensor_description/static_tensor.hpp index bdf98b7b702..447dca7c176 100644 --- a/composable_kernel/include/tensor_description/static_tensor.hpp +++ b/composable_kernel/include/tensor_description/static_tensor.hpp @@ -2,10 +2,10 @@ #define CK_STATIC_TENSOR_HPP #include "ignore.hpp" -#include "static_buffer.hpp" namespace ck { +// StaticTensor for Scalar template ::type = false> struct StaticTensor { - static constexpr index_t NDim = TensorDesc::GetNumOfDimension(); - static constexpr index_t buffer_size_ = TensorDesc{}.GetElementSpaceSize(); - - static constexpr auto desc_ = TensorDesc{}; - - using Buffer = StaticBuffer; + static constexpr auto desc_ = TensorDesc{}; + static constexpr index_t ndim_ = TensorDesc::GetNumOfDimension(); + static constexpr index_t element_space_size_ = desc_.GetElementSpaceSize(); __host__ __device__ constexpr StaticTensor() : invalid_element_value_{0} {} @@ -27,8 +24,87 @@ struct StaticTensor { } + // read access + template ::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr const T& operator[](Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + 
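+        // note (editorial): Idx is a compile-time index, so coord, offset, and the
+        // is_valid flag below all fold to constants, and the invalid-element
+        // fallback is selected by "if constexpr" at zero runtime cost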
constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_[Number{}]; + } + else + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + return T{0}; + } + else + { + return invalid_element_value_; + } + } + } + + // write access + template ::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr T& operator()(Idx) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_(Number{}); + } + else + { + return ignore; + } + } + + StaticBuffer data_; + T invalid_element_value_ = T{0}; +}; + +// StaticTensor for vector +template ::type = false> +struct StaticTensorTupleOfVectorBuffer +{ + static constexpr auto desc_ = TensorDesc{}; + static constexpr index_t ndim_ = TensorDesc::GetNumOfDimension(); + static constexpr index_t element_space_size_ = desc_.GetElementSpaceSize(); + + static constexpr index_t num_of_vector_ = + math::integer_divide_ceil(element_space_size_, ScalarPerVector); + + __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer() : invalid_element_value_{0} {} + + __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer(T invalid_element_value) + : invalid_element_value_{invalid_element_value} + { + } + + // read access template ::value && Idx::Size() == NDim, + typename enable_if::value && Idx::Size() == ndim_, bool>::type = false> __host__ __device__ constexpr const T& operator[](Idx) const { @@ -40,7 +116,7 @@ struct StaticTensor if constexpr(is_valid) { - return buffer_[Number{}]; + return data_[Number{}]; } else { @@ -55,10 +131,11 @@ struct StaticTensor } } + // write access template ::value && Idx::Size() == NDim, + typename enable_if::value && Idx::Size() == ndim_, bool>::type = false> - __host__ __device__ T& operator()(Idx) + __host__ __device__ constexpr T& operator()(Idx) { constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); @@ -68,7 +145,7 @@ struct StaticTensor if constexpr(is_valid) { - return buffer_(Number{}); + return data_(Number{}); } else { @@ -76,7 +153,7 @@ struct StaticTensor } } - Buffer buffer_; + StaticBufferTupleOfVector data_; T invalid_element_value_ = T{0}; }; diff --git a/composable_kernel/include/tensor_description/tensor_adaptor.hpp b/composable_kernel/include/tensor_description/tensor_adaptor.hpp index 50a8088bbab..8787abd6ba6 100644 --- a/composable_kernel/include/tensor_description/tensor_adaptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_adaptor.hpp @@ -151,6 +151,20 @@ struct TensorAdaptor __host__ __device__ constexpr auto GetElementSize() const { return element_size_; } +#if 0 // debug + template + __host__ __device__ constexpr index_t GetTopDimensionLength(Number idim) const + { + // TODO: not implemented + } + + template + __host__ __device__ constexpr index_t GetBottomDimensionLength(Number idim) const + { + // TODO: not implemented + } +#endif + template __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const { diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp index 36c67832042..cb0aa1c326c 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp @@ -37,7 +37,10 @@ struct 
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL); - StaticBufferV2, MRepeat * NRepeat, true> + StaticBufferOfVectorTypeV2, + MRepeat * NRepeat, + true> c_thread_buf_; __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index f46d99c1102..2e9c8c09f6a 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -330,7 +330,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 // copy data from thread_tensor_ to dst_tmp_vector static_for<0, DstScalarPerVector, 1>{}([&](auto i) { -#if 0 // debug +#if 0 dst_tmp_vector.template AsType()(i) = type_convert{}( thread_tensor_[dst_data_idx + i * dst_scalar_step_in_vector]); #else @@ -596,8 +596,17 @@ struct ThreadwiseTensorSliceTransfer_v3r2 static constexpr auto thread_tensor_desc_ = make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); +#if 0 StaticTensor thread_tensor_; +#else + StaticTensorTupleOfVectorBuffer + thread_tensor_; +#endif SrcCoord src_coord_; DstCoord dst_coord_; diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 2947ec9c43c..7c9a6b60c00 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -30,6 +30,8 @@ #include "amd_address_space.hpp" #include "amd_buffer_addressing.hpp" #include "static_buffer.hpp" +// TODO remove this +#include "static_buffer_of_vector_type_v2.hpp" #include "dynamic_buffer.hpp" #include "is_known_at_compile_time.hpp" diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp index 9615d10c597..97cd80451df 100644 --- a/composable_kernel/include/utility/static_buffer.hpp +++ b/composable_kernel/include/utility/static_buffer.hpp @@ -5,158 +5,93 @@ namespace ck { -template + bool InvalidElementUseNumericalZeroValue> // TODO remove this bool, no longer needed struct StaticBuffer : public StaticallyIndexedArray { using type = T; using base = StaticallyIndexedArray; - T invalid_element_value_ = T{0}; - __host__ __device__ constexpr StaticBuffer() : base{} {} - __host__ __device__ constexpr StaticBuffer(T invalid_element_value) - : base{}, invalid_element_value_{invalid_element_value} - { - } - __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() { - return BufferAddressSpace; + return AddressSpace; } + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } + + __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } + + // read access template - __host__ __device__ constexpr auto Get(Number i, bool is_valid_element) const + __host__ __device__ constexpr const T& operator[](Number i) const { - if constexpr(InvalidElementUseNumericalZeroValue) - { - return is_valid_element ? At(i) : T{0}; - } - else - { - return is_valid_element ? 
At(i) : invalid_element_value_; - } + return base::operator[](i); } + // write access template - __host__ __device__ void Set(Number i, bool is_valid_element, const T& x) + __host__ __device__ constexpr T& operator()(Number i) { - if(is_valid_element) - { - At(i) = x; - } + return base::operator()(i); } - - __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } - - __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } }; -template -struct StaticBufferV2 : public StaticallyIndexedArray + index_t NumOfVector, + index_t ScalarPerVector, + bool InvalidElementUseNumericalZeroValue> // TODO remove this bool, no longer needed +struct StaticBufferTupleOfVector + : public StaticallyIndexedArray, NumOfVector> { using type = T; - using base = StaticallyIndexedArray; - - using VecBaseType = typename T::d1_t; - - __host__ __device__ static constexpr index_t GetVectorSize() - { - return sizeof(typename T::type) / sizeof(VecBaseType); - } + using base = StaticallyIndexedArray, NumOfVector>; - static constexpr index_t vector_size = GetVectorSize(); + static constexpr auto scalar_per_vector = Number{}; + static constexpr auto num_of_vector_ = Number{}; - VecBaseType invalid_element_value_ = VecBaseType{0}; - - T invalid_vec_value_ = T{0}; - - __host__ __device__ constexpr StaticBufferV2() : base{} {} - - __host__ __device__ constexpr StaticBufferV2(VecBaseType invalid_element_value) - : base{}, - invalid_vec_value_{invalid_element_value}, - invalid_element_value_{invalid_element_value} - { - } + __host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {} __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() { - return BufferAddressSpace; + return AddressSpace; } - template - __host__ __device__ constexpr auto& GetVector(Number vec_id) - { - return this->At(vec_id); - } + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } - template - __host__ __device__ constexpr const auto& GetVector(Number vec_id) const - { - return this->At(vec_id); - } + __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } + // read access template - __host__ __device__ constexpr auto& GetElement(Number i, bool) + __host__ __device__ constexpr const T& operator[](Number i) const { - constexpr auto vec_id = Number{}; - constexpr auto vec_off = Number{}; + constexpr auto vector_i = i / scalar_per_vector; + constexpr auto scalar_i = i % scalar_per_vector; - return this->At(vec_id).template AsType()(vec_off); - } - - template - __host__ __device__ constexpr auto GetElement(Number i, bool is_valid_element) const - { - constexpr auto vec_id = Number{}; - constexpr auto vec_off = Number{}; - - if constexpr(InvalidElementUseNumericalZeroValue) - { - return is_valid_element ? this->At(vec_id).template AsType()[vec_off] - : VecBaseType{0}; - } - else - { - return is_valid_element ? 
this->At(vec_id).template AsType()[vec_off] - : invalid_element_value_; - } + return base::operator[](vector_i).template AsType()[scalar_i]; } + // write access template - __host__ __device__ constexpr auto operator[](Number i) const + __host__ __device__ constexpr T& operator()(Number i) { - return GetElement(i, true); - } + constexpr auto vector_i = i / scalar_per_vector; + constexpr auto scalar_i = i % scalar_per_vector; - template - __host__ __device__ constexpr auto& operator()(Number i) - { - return GetElement(i, true); + return base::operator()(vector_i).template AsType()(scalar_i); } - - __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } - - __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } }; -template +template __host__ __device__ constexpr auto make_static_buffer(Number) { - return StaticBuffer{}; -} - -template -__host__ __device__ constexpr auto make_static_buffer(Number, T invalid_element_value) -{ - return StaticBuffer{invalid_element_value}; + return StaticBuffer{}; } } // namespace ck diff --git a/composable_kernel/include/utility/static_buffer_of_vector_type_v2.hpp b/composable_kernel/include/utility/static_buffer_of_vector_type_v2.hpp new file mode 100644 index 00000000000..ed3ae201fcc --- /dev/null +++ b/composable_kernel/include/utility/static_buffer_of_vector_type_v2.hpp @@ -0,0 +1,100 @@ +#ifndef CK_STATIC_BUFFER_OF_VECTOR_TYPE_V2_HPP +#define CK_STATIC_BUFFER_OF_VECTOR_TYPE_V2_HPP + +#include "statically_indexed_array.hpp" + +namespace ck { +template +struct StaticBufferOfVectorTypeV2 : public StaticallyIndexedArray +{ + using type = T; + using base = StaticallyIndexedArray; + + using VecBaseType = typename T::d1_t; + + __host__ __device__ static constexpr index_t GetVectorSize() + { + return sizeof(typename T::type) / sizeof(VecBaseType); + } + + static constexpr index_t vector_size = GetVectorSize(); + + VecBaseType invalid_element_value_ = VecBaseType{0}; + + T invalid_vec_value_ = T{0}; + + __host__ __device__ constexpr StaticBufferOfVectorTypeV2() : base{} {} + + __host__ __device__ constexpr StaticBufferOfVectorTypeV2(VecBaseType invalid_element_value) + : base{}, + invalid_vec_value_{invalid_element_value}, + invalid_element_value_{invalid_element_value} + { + } + + __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() + { + return BufferAddressSpace; + } + + template + __host__ __device__ constexpr auto& GetVector(Number vec_id) + { + return this->At(vec_id); + } + + template + __host__ __device__ constexpr const auto& GetVector(Number vec_id) const + { + return this->At(vec_id); + } + + template + __host__ __device__ constexpr auto& GetElement(Number i, bool) + { + constexpr auto vec_id = Number{}; + constexpr auto vec_off = Number{}; + + return this->At(vec_id).template AsType()(vec_off); + } + + template + __host__ __device__ constexpr auto GetElement(Number i, bool is_valid_element) const + { + constexpr auto vec_id = Number{}; + constexpr auto vec_off = Number{}; + + if constexpr(InvalidElementUseNumericalZeroValue) + { + return is_valid_element ? this->At(vec_id).template AsType()[vec_off] + : VecBaseType{0}; + } + else + { + return is_valid_element ? 
this->At(vec_id).template AsType()[vec_off] + : invalid_element_value_; + } + } + + template + __host__ __device__ constexpr auto operator[](Number i) const + { + return GetElement(i, true); + } + + template + __host__ __device__ constexpr auto& operator()(Number i) + { + return GetElement(i, true); + } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } + + __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp index 70f4d77d874..3dea944f6d1 100644 --- a/composable_kernel/include/utility/tuple.hpp +++ b/composable_kernel/include/utility/tuple.hpp @@ -117,6 +117,7 @@ struct Tuple : detail::TupleImpl __host__ __device__ constexpr const auto& At(Number) const { @@ -124,6 +125,7 @@ struct Tuple : detail::TupleImpl{}); } + // write access template __host__ __device__ constexpr auto& At(Number) { @@ -131,12 +133,14 @@ struct Tuple : detail::TupleImpl{}); } + // read access template __host__ __device__ constexpr const auto& operator[](Number i) const { return At(i); } + // write access template __host__ __device__ constexpr auto& operator()(Number i) { diff --git a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp index 01e5c57ab41..1b23aa1a8c9 100644 --- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp @@ -160,7 +160,7 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk( constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; -#elif 0 +#elif 1 // [M, N, K0, K1] = [128, 256, 4, 8], C = 128, for fp16 constexpr index_t BlockSize = 256; @@ -188,7 +188,7 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk( constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; -#elif 1 +#elif 0 // [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16 constexpr index_t BlockSize = 256; From 61fe6c9602bef4e21e64ea3c7969a44db9a8d4d5 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Oct 2021 21:08:16 -0500 Subject: [PATCH 04/16] adding StaticTensor --- .../threadwise_tensor_slice_transfer_v3r2.hpp | 18 ++++- .../include/utility/data_type.hpp | 12 ++++ .../include/utility/static_buffer.hpp | 67 ++++++++++++++----- 3 files changed, 79 insertions(+), 18 deletions(-) diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index 2e9c8c09f6a..efc03dbe8d2 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -592,14 +592,30 @@ struct ThreadwiseTensorSliceTransfer_v3r2 move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); } + __device__ constexpr auto GetSrcThreadBufferDescriptor() + { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, 
Number{}); + + constexpr auto src_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + } + private: +#if 0 // debug static constexpr auto thread_tensor_desc_ = make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); -#if 0 StaticTensor thread_tensor_; #else + static constexpr auto thread_tensor_desc_ = + make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + StaticTensorTupleOfVectorBuffer) template struct scalar_type; +// is_scalar_type +template +struct is_scalar_type +{ + static constexpr bool value = (scalar_type>::vector_size == 1); +}; + +// has_same_scalar_type +template +using has_same_scalar_type = is_same>::type, + typename scalar_type>::type>; + template struct scalar_type { diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp index 97cd80451df..9692e5a8287 100644 --- a/composable_kernel/include/utility/static_buffer.hpp +++ b/composable_kernel/include/utility/static_buffer.hpp @@ -43,18 +43,19 @@ struct StaticBuffer : public StaticallyIndexedArray // static buffer for vector template // TODO remove this bool, no longer needed + bool InvalidElementUseNumericalZeroValue, // TODO remove this bool, no longer needed, + typename enable_if::value, bool>::type = false> struct StaticBufferTupleOfVector - : public StaticallyIndexedArray, NumOfVector> + : public StaticallyIndexedArray, NumOfVector> { - using type = T; - using base = StaticallyIndexedArray, NumOfVector>; + using V = typename vector_type::type; + using base = StaticallyIndexedArray, NumOfVector>; - static constexpr auto scalar_per_vector = Number{}; - static constexpr auto num_of_vector_ = Number{}; + static constexpr auto s_per_v = Number{}; + static constexpr auto num_of_v_ = Number{}; __host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {} @@ -67,24 +68,56 @@ struct StaticBufferTupleOfVector __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } - // read access + // read access of S template - __host__ __device__ constexpr const T& operator[](Number i) const + __host__ __device__ constexpr const S& operator[](Number i) const { - constexpr auto vector_i = i / scalar_per_vector; - constexpr auto scalar_i = i % scalar_per_vector; + constexpr auto i_v = i / s_per_v; + constexpr auto i_s = i % s_per_v; - return base::operator[](vector_i).template AsType()[scalar_i]; + return base::operator[](i_v).template AsType()[i_s]; } - // write access + // write access of S template - __host__ __device__ constexpr T& operator()(Number i) + __host__ __device__ constexpr S& operator()(Number i) { - constexpr auto vector_i = i / scalar_per_vector; - constexpr auto scalar_i = i % scalar_per_vector; + constexpr auto i_v = i / s_per_v; + constexpr auto i_s = i % s_per_v; + + return base::operator()(i_v).template AsType()(i_s); + } + + // read access of X + template ::value, bool>::type = false> + __host__ __device__ constexpr auto GetAsType(Number i) const + { + constexpr index_t s_per_x = scalar_type>::vector_size; + + static_assert(s_per_v % s_per_x == 0, "wrong! 
V must be one or multiple X");
+
+        constexpr auto i_v = i / s_per_v;
+        constexpr auto i_x = (i % s_per_v) / s_per_x;
+
+        return base::operator[](i_v).template AsType()[i_x];
+    }
+
+    // write access of X
+    template ::value, bool>::type = false>
+    __host__ __device__ constexpr void SetAsType(Number i, X x)
+    {
+        constexpr index_t s_per_x = scalar_type>::vector_size;
+
+        static_assert(s_per_v % s_per_x == 0, "wrong! V must contain one or multiple X");
+
+        constexpr auto i_v = i / s_per_v;
+        constexpr auto i_x = (i % s_per_v) / s_per_x;
 
-        return base::operator()(vector_i).template AsType()(scalar_i);
+        base::operator()(i_v).template AsType()(i_x) = x;
     }
 };
 
From 7c898b2a42577af095e05b0217dc46c286e40330 Mon Sep 17 00:00:00 2001
From: Chao Liu 
Date: Sun, 17 Oct 2021 22:57:13 -0500
Subject: [PATCH 05/16] add missing constexpr

---
 .../include/tensor_description/multi_index_transform.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp
index 1a25e99f3bb..248148686bc 100644
--- a/composable_kernel/include/tensor_description/multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp
@@ -30,7 +30,8 @@ struct PassThrough
     __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
 
     template 
-    __host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
+    __host__ __device__ static constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                                  const UpIdx& idx_up)
     {
         static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
                       "wrong! inconsistent # of dimension");
@@ -1708,7 +1709,8 @@ struct Vectorize
     __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
 
     template 
-    __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
     {
         static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
                       "wrong! 
inconsistent # of dimension"); From ba8564641fbac87a8fa824c6cce97e341c1eee5a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Oct 2021 23:53:45 -0500 Subject: [PATCH 06/16] adding static tensor --- .../gridwise_gemm_xdlops_v2r3.hpp | 98 +++++++------- .../threadwise_tensor_slice_transfer_v3r2.hpp | 126 +++++++++++++----- 2 files changed, 144 insertions(+), 80 deletions(-) diff --git a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp index e3b0054bec2..c1df3961d35 100644 --- a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp @@ -397,56 +397,58 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 }(); // A matrix blockwise copy - auto a_blockwise_copy = - BlockwiseTensorSliceTransfer_v4, - ABlockTransferThreadSliceLengths_K0_M_K1, - ABlockTransferThreadClusterLengths_K0_M_K1, - ABlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(a_k0_m_k1_grid_desc), - decltype(a_k0_m_k1_block_desc), - ABlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - ABlockTransferSrcVectorDim, - 2, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_K1, - 1, - 1, - AThreadTransferSrcResetCoordinateAfterRun, - true>(a_k0_m_k1_grid_desc, - make_multi_index(0, m_block_data_idx_on_grid, 0), - a_k0_m_k1_block_desc, - make_multi_index(0, 0, 0)); + auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4< + BlockSize, + InMemoryDataOperationEnum_t::Set, + Sequence, + ABlockTransferThreadSliceLengths_K0_M_K1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_k0_m_k1_grid_desc), + decltype(a_k0_m_k1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, // TODO: maybe ABlockTransferDstAccessOrder should be the same as + // ABlockTransferSrcAccessOrder? + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>(a_k0_m_k1_grid_desc, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_k0_m_k1_block_desc, + make_multi_index(0, 0, 0)); // B matrix blockwise copy - auto b_blockwise_copy = - BlockwiseTensorSliceTransfer_v4, - BBlockTransferThreadSliceLengths_K0_N_K1, - BBlockTransferThreadClusterLengths_K0_N_K1, - BBlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(b_k0_n_k1_grid_desc), - decltype(b_k0_n_k1_block_desc), - BBlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - BBlockTransferSrcVectorDim, - 2, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_K1, - 1, - 1, - BThreadTransferSrcResetCoordinateAfterRun, - true>(b_k0_n_k1_grid_desc, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b_k0_n_k1_block_desc, - make_multi_index(0, 0, 0)); + auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v4< + BlockSize, + InMemoryDataOperationEnum_t::Set, + Sequence, + BBlockTransferThreadSliceLengths_K0_N_K1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_k0_n_k1_grid_desc), + decltype(b_k0_n_k1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, // TODO: maybe BBlockTransferDstAccessOrder should be the same as + // BBlockTransferSrcAccessOrder? 
+ BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>(b_k0_n_k1_grid_desc, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_k0_n_k1_block_desc, + make_multi_index(0, 0, 0)); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index efc03dbe8d2..3579c10418e 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -172,13 +172,10 @@ struct ThreadwiseTensorSliceTransfer_v3r2 src_tmp_vector.template AsType()(Number<0>{}) = src_buf.template Get(src_coord_.GetOffset(), is_src_valid); - // copy data from src_tmp_vector to thread_tensor_ + // copy data from src_tmp_vector to src_thread_scratch_ static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { -#if 0 // debug - thread_tensor_(src_data_idx + i * src_scalar_step_in_vector) = - src_tmp_vector.template AsType()[i]; -#else - thread_tensor_(src_data_idx_seq + i * src_scalar_step_in_vector) = +#if 1 // debug + src_thread_scratch_(src_data_idx_seq + i * src_scalar_step_in_vector) = src_tmp_vector.template AsType()[i]; #endif }); @@ -228,10 +225,24 @@ struct ThreadwiseTensorSliceTransfer_v3r2 } } + __device__ void TransferDataFromSrcThreadScratchToDstThreadScratch() + { + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = type_convert{}(src_thread_scratch_[idx]); + }); + } + template __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) { +#if 1 // debug + // if there is transpose, it's done here + // TODO move this elsewhere + TransferDataFromSrcThreadScratchToDstThreadScratch(); +#endif + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, "wrong!"); @@ -328,15 +339,10 @@ struct ThreadwiseTensorSliceTransfer_v3r2 vector_type_maker_t dst_tmp_vector; - // copy data from thread_tensor_ to dst_tmp_vector + // copy data from dst_thread_scratch_ to dst_tmp_vector static_for<0, DstScalarPerVector, 1>{}([&](auto i) { -#if 0 - dst_tmp_vector.template AsType()(i) = type_convert{}( - thread_tensor_[dst_data_idx + i * dst_scalar_step_in_vector]); -#else - dst_tmp_vector.template AsType()(i) = type_convert{}( - thread_tensor_[dst_data_idx_seq + i * dst_scalar_step_in_vector]); -#endif + dst_tmp_vector.template AsType()(i) = + dst_thread_scratch_[dst_data_idx_seq + i * dst_scalar_step_in_vector]; }); using dst_vector_t = typename decltype(dst_tmp_vector)::type; @@ -592,36 +598,92 @@ struct ThreadwiseTensorSliceTransfer_v3r2 move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); } - __device__ constexpr auto GetSrcThreadBufferDescriptor() +#if 1 // debug + __device__ static constexpr auto GetSrcThreadScratchDescriptor() { - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto src_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto src_scalar_step_in_vector = - 
generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(src_access_lengths), Number{}); + + constexpr auto desc0 = + make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length); + + // TODO this is hardcoded for GEMM TN layout, it also works for NHWC backward-weight + // TODO implemenet the general logic + constexpr auto desc1 = transform_tensor_descriptor( + desc0, + make_tuple(make_pass_through_transform(src_access_lengths_and_vector_length[I0]), + make_merge_transform_v3_division_mod( + make_tuple(src_access_lengths_and_vector_length[I1], + src_access_lengths_and_vector_length[I3])), + make_pass_through_transform(src_access_lengths_and_vector_length[I2])), + make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return desc1; } - private: -#if 0 // debug - static constexpr auto thread_tensor_desc_ = - make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + __device__ static constexpr auto GetDstThreadScratchDescriptor() + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; - StaticTensor - thread_tensor_; -#else - static constexpr auto thread_tensor_desc_ = - make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + constexpr auto dst_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(dst_access_lengths), Number{}); + + constexpr auto desc0 = + make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length); + + // TODO this is hardcoded for GEMM TN layout, it also works for NHWC backward-weight + // TODO implemenet the general logic + constexpr auto desc1 = transform_tensor_descriptor( + desc0, + make_tuple(make_pass_through_transform(dst_access_lengths_and_vector_length[I0]), + make_pass_through_transform(dst_access_lengths_and_vector_length[I1]), + make_merge_transform_v3_division_mod( + make_tuple(dst_access_lengths_and_vector_length[I2], + dst_access_lengths_and_vector_length[I3]))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return desc1; + } +#endif + + private: +#if 1 // debug + static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){}; + static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){}; StaticTensorTupleOfVectorBuffer + src_thread_scratch_; + + StaticTensorTupleOfVectorBuffer - thread_tensor_; + dst_thread_scratch_; #endif SrcCoord src_coord_; From ba74c9ff275c0d389466c01d532a3414d7e748d2 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Oct 2021 00:49:56 -0500 Subject: [PATCH 07/16] adding static tensor --- .../tensor_description/static_tensor.hpp | 71 ++++++++++++++++--- .../threadwise_tensor_slice_transfer_v3r2.hpp | 45 +++--------- .../include/utility/static_buffer.hpp | 4 +- 3 files changed, 75 insertions(+), 45 deletions(-) diff --git a/composable_kernel/include/tensor_description/static_tensor.hpp 
b/composable_kernel/include/tensor_description/static_tensor.hpp index 447dca7c176..9bf9b13daf9 100644 --- a/composable_kernel/include/tensor_description/static_tensor.hpp +++ b/composable_kernel/include/tensor_description/static_tensor.hpp @@ -81,7 +81,7 @@ struct StaticTensor // StaticTensor for vector template ::value && Idx::Size() == ndim_, bool>::type = false> - __host__ __device__ constexpr const T& operator[](Idx) const + __host__ __device__ constexpr const S& operator[](Idx) const { constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); @@ -122,7 +122,7 @@ struct StaticTensorTupleOfVectorBuffer { if constexpr(InvalidElementUseNumericalZeroValue) { - return T{0}; + return S{0}; } else { @@ -131,11 +131,11 @@ struct StaticTensorTupleOfVectorBuffer } } - // write access + // write access of S template ::value && Idx::Size() == ndim_, bool>::type = false> - __host__ __device__ constexpr T& operator()(Idx) + __host__ __device__ constexpr S& operator()(Idx) { constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); @@ -153,8 +153,61 @@ struct StaticTensorTupleOfVectorBuffer } } - StaticBufferTupleOfVector data_; - T invalid_element_value_ = T{0}; + // read access of X + template ::value && + is_known_at_compile_time::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr X GetAsType(Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_.template GetAsType(Number{}); + } + else + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + // TODO: is this right way to initialize a vector? + return X{0}; + } + else + { + // TODO: is this right way to initialize a vector? 
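+                // note (editorial, assuming clang ext_vector semantics): brace-
+                // initializing a vector from one scalar, as below, fills only lane 0
+                // and zero-fills the remaining lanes; a true broadcast of
+                // invalid_element_value_ would need an explicit per-lane fill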
+ return X{invalid_element_value_}; + } + } + } + + // write access of X + template ::value && + is_known_at_compile_time::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr void SetAsType(Idx, X x) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + data_.template SetAsType(Number{}, x); + } + } + + StaticBufferTupleOfVector data_; + S invalid_element_value_ = S{0}; }; template {}, Number{}); - constexpr auto src_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; constexpr auto src_dim_access_order = SrcDimAccessOrder{}; @@ -159,26 +156,17 @@ struct ThreadwiseTensorSliceTransfer_v3r2 #if 1 // debug constexpr auto src_data_idx_seq = generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); -#endif - - vector_type_maker_t src_tmp_vector; - - using src_vector_t = typename decltype(src_tmp_vector)::type; const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); - // copy data from src_buf to src_tmp_vector - src_tmp_vector.template AsType()(Number<0>{}) = - src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + using src_vector_t = typename vector_type_maker_t::type; - // copy data from src_tmp_vector to src_thread_scratch_ - static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { -#if 1 // debug - src_thread_scratch_(src_data_idx_seq + i * src_scalar_step_in_vector) = - src_tmp_vector.template AsType()[i]; + // copy data from src_buf to src_thread_scratch_ + src_thread_scratch_.template SetAsType( + src_data_idx_seq, + src_buf.template Get(src_coord_.GetOffset(), is_src_valid)); #endif - }); constexpr auto move_on_dim = [&]() constexpr { @@ -259,9 +247,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 constexpr auto dst_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto dst_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; constexpr auto dst_dim_access_order = DstDimAccessOrder{}; @@ -335,26 +320,18 @@ struct ThreadwiseTensorSliceTransfer_v3r2 #if 1 // debug constexpr auto dst_data_idx_seq = generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); -#endif - vector_type_maker_t dst_tmp_vector; - - // copy data from dst_thread_scratch_ to dst_tmp_vector - static_for<0, DstScalarPerVector, 1>{}([&](auto i) { - dst_tmp_vector.template AsType()(i) = - dst_thread_scratch_[dst_data_idx_seq + i * dst_scalar_step_in_vector]; - }); - - using dst_vector_t = typename decltype(dst_tmp_vector)::type; - - // copy data from dst_tmp_vector to dst_buf const bool is_dst_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + using dst_vector_t = typename vector_type_maker_t::type; + + // copy data from dst_thread_scratch_ to dst_buf dst_buf.template Set( dst_coord_.GetOffset(), is_dst_valid, - dst_tmp_vector.template AsType()[Number<0>{}]); + dst_thread_scratch_.template GetAsType(dst_data_idx_seq)); +#endif constexpr auto move_on_dim = [&]() constexpr { @@ -681,7 +658,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 StaticTensorTupleOfVectorBuffer dst_thread_scratch_; #endif diff --git 
a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp index 9692e5a8287..62cfb2f492c 100644 --- a/composable_kernel/include/utility/static_buffer.hpp +++ b/composable_kernel/include/utility/static_buffer.hpp @@ -94,7 +94,7 @@ struct StaticBufferTupleOfVector typename enable_if::value, bool>::type = false> __host__ __device__ constexpr auto GetAsType(Number i) const { - constexpr index_t s_per_x = scalar_type>::vector_size; + constexpr auto s_per_x = Number>::vector_size>{}; static_assert(s_per_v % s_per_x == 0, "wrong! V must one or multiple X"); @@ -110,7 +110,7 @@ struct StaticBufferTupleOfVector typename enable_if::value, bool>::type = false> __host__ __device__ constexpr void SetAsType(Number i, X x) { - constexpr index_t s_per_x = scalar_type>::vector_size; + constexpr auto s_per_x = Number>::vector_size>{}; static_assert(s_per_v % s_per_x == 0, "wrong! V must contain one or multiple X"); From 977185e685d952488e3ae252a54c4a3013397432 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Oct 2021 03:29:25 -0500 Subject: [PATCH 08/16] adding transpose --- .../threadwise_tensor_slice_transfer_v3r2.hpp | 107 +++++++++++++++++- .../include/utility/common_header.hpp | 1 + .../include/utility/transpose_vectors.hpp | 60 ++++++++++ .../include/device_gemm_xdlops_km_kn_mn.hpp | 6 +- 4 files changed, 169 insertions(+), 5 deletions(-) create mode 100644 composable_kernel/include/utility/transpose_vectors.hpp diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index 736942e23ec..da1c2d3eece 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -8,6 +8,38 @@ namespace ck { +namespace detail { +// TODO: How to fix this? It uses an struct instead of lambda because lambda +// doesn't have constructor +template +struct lambda_scalar_per_access_for_src_and_dst +{ + __host__ __device__ constexpr auto operator()(index_t i) const + { + if(i == SrcVectorDim && i == DstVectorDim) + { + return math::lcm(SrcScalarPerVector, DstScalarPerVector); + } + else if(i == SrcVectorDim) + { + return SrcScalarPerVector; + } + else if(i == DstVectorDim) + { + return DstScalarPerVector; + } + else + { + return 1; + } + } +}; + +} // namespace detail + // Assume: // 1. src_desc and dst_desc are not known at compile-time // 2. SrcBuffer and DstBuffer are DynamicBuffer @@ -185,7 +217,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 } (); - // move + // move src coord static_for<0, nDim, 1>{}([&](auto i) { if constexpr(move_on_dim[i]) { @@ -215,10 +247,81 @@ struct ThreadwiseTensorSliceTransfer_v3r2 __device__ void TransferDataFromSrcThreadScratchToDstThreadScratch() { +#if 0 // debug static_ford{}([&](auto idx) { // convert from SrcData to DstData here dst_thread_scratch_(idx) = type_convert{}(src_thread_scratch_[idx]); }); +#else + if constexpr(SrcVectorDim == DstVectorDim) + { + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = type_convert{}(src_thread_scratch_[idx]); + }); + } + else + { + // TODO type_convert is not used yet!!!!! 
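+            // sketch (editorial, hypothetical sizes): with SrcScalarPerVector = 2
+            // and DstScalarPerVector = 2, one step reads two src vectors {a0, a1},
+            // {b0, b1} and writes two dst vectors {a0, b0}, {a1, b1} - a 2x2
+            // element transpose at vector granularity; the loop below tiles the
+            // whole slice with such steps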
+ using src_vector_t = typename vector_type_maker_t::type; + using dst_vector_t = typename vector_type_maker_t::type; + + // each transpose does + // DstScalarPerVector # of src vectors in src_thread_scratch_ + // SrcScalarPerVector # of dst vectors in dst_thread_scratch_ + constexpr index_t num_src_vector = Number{}; + constexpr index_t num_dst_vector = Number{}; + + // Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose + // TODO: make this logic generic for all scenario + static_assert(SrcVectorDim != DstVectorDim, "wrong"); + + constexpr auto src_scalar_step_in_vector = generate_sequence( + detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = generate_sequence( + detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access_for_src_and_dst{}, + Number{}); + + constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + + static_ford{}([&](auto access_idx) { + constexpr auto data_idx = access_idx * scalar_per_access; + + constexpr auto data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + // get DstScalarPerVector # of src vectors from src_thread_scratch_ + const auto src_vectors = generate_tuple( + [&](auto i) { + // i increment corresponds to movement in DstVectorDim + return src_thread_scratch_.template GetAsType( + data_idx_seq + i * dst_scalar_step_in_vector); + }, + Number{}); + + StaticallyIndexedArray dst_vectors; + + // do data transpose + // TODO type_convert is not used yet!!!!! + transpose_vectors{}(src_vectors, + dst_vectors); + + // put SrcScalarPerVector # of dst vectors into dst_thread_scratch_ + static_for<0, num_dst_vector, 1>{}([&](auto i) { + // i increment corresponds to movement in DstVectorDim + dst_thread_scratch_.template SetAsType( + data_idx_seq + i * src_scalar_step_in_vector, dst_vectors[i]); + }); + }); + } +#endif } template @@ -350,7 +453,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 } (); - // move + // move dst coord static_for<0, nDim, 1>{}([&](auto i) { if constexpr(move_on_dim[i]) { diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 7c9a6b60c00..4afdc7d788f 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -34,6 +34,7 @@ #include "static_buffer_of_vector_type_v2.hpp" #include "dynamic_buffer.hpp" #include "is_known_at_compile_time.hpp" +#include "transpose_vectors.hpp" #include "inner_product.hpp" diff --git a/composable_kernel/include/utility/transpose_vectors.hpp b/composable_kernel/include/utility/transpose_vectors.hpp new file mode 100644 index 00000000000..ef4e10c49db --- /dev/null +++ b/composable_kernel/include/utility/transpose_vectors.hpp @@ -0,0 +1,60 @@ +#ifndef CK_TRANSPOSE_VECTORS_AMD_HPP +#define CK_TRANSPOSE_VECTORS_AMD_HPP + +#include "config.hpp" +#include "statically_indexed_array.hpp" +#include "data_type.hpp" + +namespace ck { + +template ::value, bool>::type = false> +struct transpose_vectors; + +// transpose fp16 2x2 +__device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t& y0, half2_t& y1) +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + +#if 1 // debug + const vector_type vx0{x0}, vx1{x1}; + vector_type vy0, vy1; + + vy0.template AsType()(I0) = vx0.template AsType()[I0]; + vy0.template AsType()(I1) = vx1.template AsType()[I0]; + 
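+    // note (editorial): the two lines above gather lane 0 of x0 and x1 into y0,
+    // and the two lines below gather lane 1 of each into y1, i.e.
+    // {a0, a1}, {b0, b1} -> {a0, b0}, {a1, b1}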
+ vy1.template AsType()(I0) = vx0.template AsType()[I1]; + vy1.template AsType()(I1) = vx1.template AsType()[I1]; + + y0 = vy0.template AsType()[I0]; + y1 = vy1.template AsType()[I0]; +#endif +} + +template +struct transpose_vectors +{ + using X = typename vector_type::type; + using Y = typename vector_type::type; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __device__ void operator()(const StaticallyIndexedArray& xs, + StaticallyIndexedArray& ys) + { + // TODO make this generic for any NX, NY + static_assert((NX == 2 && NY == 2), "wrong!"); + + if constexpr(NX == 2 && NY == 2) + { + transpose_fp16_2x2(xs[I0], xs[I1], ys(I0), ys(I1)); + } + } +}; + +} // namespace ck +#endif diff --git a/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp b/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp index c44aa7d9a27..396a55caf89 100644 --- a/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp +++ b/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp @@ -180,14 +180,14 @@ void device_gemm_xdlops_km_kn_mn(const Tensor& a_k_m, using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>; using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>; - constexpr index_t ABlockTransferSrcScalarPerVector_M = 4; - constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8; + constexpr index_t ABlockTransferSrcScalarPerVector_M = 2; + constexpr index_t ABlockTransferDstScalarPerVector_K1 = 2; using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>; using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>; constexpr index_t BBlockTransferSrcScalarPerVector_N = 2; - constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8; + constexpr index_t BBlockTransferDstScalarPerVector_K1 = 2; constexpr index_t CThreadTransferDstScalarPerVector = 1; #elif 0 From bc917d10f6cdde98d46b2500b0f945588fc07dca Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Oct 2021 04:33:06 -0500 Subject: [PATCH 09/16] add inline asm for transpose 2x2 of half_t --- .../include/utility/transpose_vectors.hpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/composable_kernel/include/utility/transpose_vectors.hpp b/composable_kernel/include/utility/transpose_vectors.hpp index ef4e10c49db..03327e280a9 100644 --- a/composable_kernel/include/utility/transpose_vectors.hpp +++ b/composable_kernel/include/utility/transpose_vectors.hpp @@ -16,10 +16,10 @@ struct transpose_vectors; // transpose fp16 2x2 __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t& y0, half2_t& y1) { +#if 0 // debug static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; -#if 1 // debug const vector_type vx0{x0}, vx1{x1}; vector_type vy0, vy1; @@ -31,14 +31,24 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t y0 = vy0.template AsType()[I0]; y1 = vy1.template AsType()[I0]; +#else + asm volatile("\n \ + v_pack_b32_f16 %0, %2, %3 \n \ + v_pack_b32_f16 %1, %2, %3, op_sel:[1, 1] \n \ + " + : "=v"(y0), "=v"(y1) + : "v"(x0), "v"(x1), "0"(y0), "1"(y1)); #endif } template struct transpose_vectors { - using X = typename vector_type::type; - using Y = typename vector_type::type; + static constexpr index_t s_per_x = NY; + static constexpr index_t s_per_y = NX; + + using X = typename vector_type::type; + using Y = typename vector_type::type; static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; From 
6561289eab13bc550cdfcc8460d6e39734923241 Mon Sep 17 00:00:00 2001
From: Chao Liu 
Date: Mon, 18 Oct 2021 21:48:53 -0500
Subject: [PATCH 10/16] add general transpose_vectors(), but has unnecessary
 register initialization using v_mov

---
 .../tensor_description/static_tensor.hpp      |  4 ++
 .../include/utility/static_buffer.hpp         |  6 +++
 .../include/utility/transpose_vectors.hpp     | 47 ++++++++++++++++---
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/composable_kernel/include/tensor_description/static_tensor.hpp b/composable_kernel/include/tensor_description/static_tensor.hpp
index 9bf9b13daf9..86f3e3d1676 100644
--- a/composable_kernel/include/tensor_description/static_tensor.hpp
+++ b/composable_kernel/include/tensor_description/static_tensor.hpp
@@ -95,6 +95,8 @@ struct StaticTensorTupleOfVectorBuffer
     static constexpr index_t num_of_vector_ =
         math::integer_divide_ceil(element_space_size_, ScalarPerVector);
 
+    using V = vector_type;
+
     __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer() : invalid_element_value_{0} {}
 
     __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer(S invalid_element_value)
@@ -103,6 +105,7 @@ struct StaticTensorTupleOfVectorBuffer
     }
 
     // read access of S
+    // Idx is for S, not V. Idx should be aligned with V
     template ::value && Idx::Size() == ndim_,
                                      bool>::type = false>
@@ -132,6 +135,7 @@ struct StaticTensorTupleOfVectorBuffer
     }
 
     // write access of S
+    // Idx is for S, not V. Idx should be aligned with V
     template ::value && Idx::Size() == ndim_,
                                      bool>::type = false>
diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp
index 62cfb2f492c..02deb06d6d6 100644
--- a/composable_kernel/include/utility/static_buffer.hpp
+++ b/composable_kernel/include/utility/static_buffer.hpp
@@ -69,6 +69,7 @@ struct StaticBufferTupleOfVector
     __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; }
 
     // read access of S
+    // i is offset of S
     template 
     __host__ __device__ constexpr const S& operator[](Number i) const
     {
@@ -79,6 +80,7 @@ struct StaticBufferTupleOfVector
     }
 
     // write access of S
+    // i is offset of S
     template 
     __host__ __device__ constexpr S& operator()(Number i)
     {
@@ -89,6 +91,7 @@ struct StaticBufferTupleOfVector
     }
 
     // read access of X
+    // i is offset of S, not X. i should be aligned to X
     template ::value, bool>::type = false>
@@ -97,6 +100,7 @@ struct StaticBufferTupleOfVector
         constexpr auto s_per_x = Number>::vector_size>{};
 
         static_assert(s_per_v % s_per_x == 0, "wrong! V must be one or multiple X");
+        static_assert(i % s_per_x == 0, "wrong!");
 
         constexpr auto i_v = i / s_per_v;
         constexpr auto i_x = (i % s_per_v) / s_per_x;
@@ -105,6 +109,7 @@ struct StaticBufferTupleOfVector
     }
 
     // write access of X
+    // i is offset of S, not X. i should be aligned to X
     template ::value, bool>::type = false>
@@ -113,6 +118,7 @@ struct StaticBufferTupleOfVector
         constexpr auto s_per_x = Number>::vector_size>{};
 
         static_assert(s_per_v % s_per_x == 0, "wrong! 
+        static_assert(i % s_per_x == 0, "wrong!");
 
         constexpr auto i_v = i / s_per_v;
         constexpr auto i_x = (i % s_per_v) / s_per_x;
 
diff --git a/composable_kernel/include/utility/transpose_vectors.hpp b/composable_kernel/include/utility/transpose_vectors.hpp
index 03327e280a9..0b865f0ba66 100644
--- a/composable_kernel/include/utility/transpose_vectors.hpp
+++ b/composable_kernel/include/utility/transpose_vectors.hpp
@@ -44,25 +44,58 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
 template <index_t NX, index_t NY>
 struct transpose_vectors<half_t, NX, NY>
 {
+    // we got [NY * NX] amount of S data to be transposed
     static constexpr index_t s_per_x = NY;
     static constexpr index_t s_per_y = NX;
 
+    using S = half_t;
     using X = typename vector_type<half_t, s_per_x>::type;
     using Y = typename vector_type<half_t, s_per_y>::type;
 
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-
-    __device__ void operator()(const StaticallyIndexedArray<X, NX>& xs,
-                               StaticallyIndexedArray<Y, NY>& ys)
+    __device__ void operator()(const StaticallyIndexedArray<X, NX>& x_tuple,
+                               StaticallyIndexedArray<Y, NY>& y_tuple)
     {
-        // TODO make this generic for any NX, NY
+        static constexpr auto I0 = Number<0>{};
+        static constexpr auto I1 = Number<1>{};
+        static constexpr auto I2 = Number<2>{};
+
+#if 0
         static_assert((NX == 2 && NY == 2), "wrong!");
 
         if constexpr(NX == 2 && NY == 2)
         {
-            transpose_fp16_2x2(xs[I0], xs[I1], ys(I0), ys(I1));
+            transpose_fp16_2x2(x_tuple[I0], x_tuple[I1], y_tuple(I0), y_tuple(I1));
         }
+#else
+        static_assert((NX % 2 == 0 && NY % 2 == 0), "wrong!");
+
+        // create tuple of vector_type for holding data from x_tuple
+        const auto vx_tuple = generate_tuple(
+            [&](auto i) { return vector_type<half_t, s_per_x>{x_tuple[i]}; }, Number<NX>{});
+
+        // create tuple of vector_type to hold intermediate data for y_tuple
+        auto vy_tuple =
+            generate_tuple([&](auto) { return vector_type<half_t, s_per_y>{}; }, Number<NY>{});
+
+        // loop over 2x2 tile and transpose data from vx_tuple into vy_tuple
+        static_for<0, NY, 2>{}([&](auto iy) {
+            static_for<0, NX, 2>{}([&](auto ix) {
+                // reference to 2 half2_t data from vx_tuple
+                const auto& x_s2_0 = vx_tuple[ix].template AsType<half2_t>()[iy / I2];
+                const auto& x_s2_1 = vx_tuple[ix + I1].template AsType<half2_t>()[iy / I2];
+
+                // reference to 2 half2_t data from vy_tuple
+                auto& y_s2_0 = vy_tuple(iy).template AsType<half2_t>()(ix / I2);
+                auto& y_s2_1 = vy_tuple(iy + I1).template AsType<half2_t>()(ix / I2);
+
+                // transpose
+                transpose_fp16_2x2(x_s2_0, x_s2_1, y_s2_0, y_s2_1);
+            });
+        });
+
+        // copy data from vy_tuple into y_tuple
+        static_for<0, NY, 1>{}([&](auto i) { y_tuple(i) = vy_tuple[i].template AsType<Y>()[I0]; });
+#endif
     }
 };

From 9ad748713638df1aa9a5e6566794eb04472b1e6f Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Tue, 19 Oct 2021 00:26:47 -0500
Subject: [PATCH 11/16] fix unnecessary register initialization in transpose_vector by using more pass-by-reference

---
 .../tensor_description/static_tensor.hpp      | 38 ++++++++++++++++---
 .../blockwise_tensor_slice_transfer.hpp       |  2 +-
 .../threadwise_tensor_slice_transfer_v3r2.hpp | 36 +++++++++---------
 .../include/utility/static_buffer.hpp         | 32 ++++++++++++++--
 .../utility/statically_indexed_array.hpp      | 34 +++++++++++++----
 .../include/utility/transpose_vectors.hpp     | 31 +++------------
 composable_kernel/include/utility/tuple.hpp   |  7 ++++
 .../include/utility/tuple_helper.hpp          |  7 ++++
 .../include/device_gemm_xdlops_km_kn_mn.hpp   |  6 +--
 9 files changed, 128 insertions(+), 65 deletions(-)

diff --git a/composable_kernel/include/tensor_description/static_tensor.hpp
b/composable_kernel/include/tensor_description/static_tensor.hpp index 86f3e3d1676..e71980b8183 100644 --- a/composable_kernel/include/tensor_description/static_tensor.hpp +++ b/composable_kernel/include/tensor_description/static_tensor.hpp @@ -104,8 +104,8 @@ struct StaticTensorTupleOfVectorBuffer { } - // read access of S - // Idx is for S, not V. Idx should be aligned with V + // Get S + // Idx is for S, not V template ::value && Idx::Size() == ndim_, bool>::type = false> @@ -134,8 +134,8 @@ struct StaticTensorTupleOfVectorBuffer } } - // write access of S - // Idx is for S, not V. Idx should be aligned with V + // Set S + // Idx is for S, not V template ::value && Idx::Size() == ndim_, bool>::type = false> @@ -157,7 +157,8 @@ struct StaticTensorTupleOfVectorBuffer } } - // read access of X + // Get X + // Idx is for S, not X. Idx should be aligned with X template ::value && @@ -190,7 +191,8 @@ struct StaticTensorTupleOfVectorBuffer } } - // write access of X + // Set X + // Idx is for S, not X. Idx should be aligned with X template ::value && @@ -210,6 +212,30 @@ struct StaticTensorTupleOfVectorBuffer } } + // Get read access to V. No is_valid check + // Idx is for S, not V. Idx should be aligned with V + template + __host__ __device__ constexpr const V& GetVectorTypeReference(Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + return data_.GetVectorTypeReference(Number{}); + } + + // Get read access to V. No is_valid check + // Idx is for S, not V. Idx should be aligned with V + template + __host__ __device__ constexpr V& GetVectorTypeReference(Idx) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + return data_.GetVectorTypeReference(Number{}); + } + StaticBufferTupleOfVector data_; S invalid_element_value_ = S{0}; }; diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp index e0330853a74..f815a47f693 100644 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp @@ -5,7 +5,7 @@ #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" #include "cluster_descriptor.hpp" -//#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" #include "threadwise_tensor_slice_transfer_v3r2.hpp" namespace ck { diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index da1c2d3eece..d3a77482829 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -262,9 +262,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 } else { - // TODO type_convert is not used yet!!!!! 
- using src_vector_t = typename vector_type_maker_t::type; - using dst_vector_t = typename vector_type_maker_t::type; // each transpose does // DstScalarPerVector # of src vectors in src_thread_scratch_ @@ -297,28 +294,33 @@ struct ThreadwiseTensorSliceTransfer_v3r2 constexpr auto data_idx_seq = generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); - // get DstScalarPerVector # of src vectors from src_thread_scratch_ - const auto src_vectors = generate_tuple( - [&](auto i) { + // TODO type_convert is not used yet!!!!! + using src_vector_t = vector_type_maker_t; + using dst_vector_t = vector_type_maker_t; + + // get DstScalarPerVector # of read-only references to src vectors from + // src_thread_scratch_ + const auto src_vector_refs = generate_tie( + [&](auto i) -> const src_vector_t& { // i increment corresponds to movement in DstVectorDim - return src_thread_scratch_.template GetAsType( + return src_thread_scratch_.GetVectorTypeReference( data_idx_seq + i * dst_scalar_step_in_vector); }, Number{}); - StaticallyIndexedArray dst_vectors; + // get SrcScalarPerVector # of references to dst vectors from dst_thread_scratch_ + auto dst_vector_refs = generate_tie( + [&](auto i) -> dst_vector_t& { + // i increment corresponds to movement in SrcVectorDim + return dst_thread_scratch_.GetVectorTypeReference( + data_idx_seq + i * src_scalar_step_in_vector); + }, + Number{}); // do data transpose // TODO type_convert is not used yet!!!!! - transpose_vectors{}(src_vectors, - dst_vectors); - - // put SrcScalarPerVector # of dst vectors into dst_thread_scratch_ - static_for<0, num_dst_vector, 1>{}([&](auto i) { - // i increment corresponds to movement in DstVectorDim - dst_thread_scratch_.template SetAsType( - data_idx_seq + i * src_scalar_step_in_vector, dst_vectors[i]); - }); + transpose_vectors{}( + src_vector_refs, dst_vector_refs); }); } #endif diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp index 02deb06d6d6..1deb0780252 100644 --- a/composable_kernel/include/utility/static_buffer.hpp +++ b/composable_kernel/include/utility/static_buffer.hpp @@ -68,7 +68,7 @@ struct StaticBufferTupleOfVector __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } - // read access of S + // Get S // i is offset of S template __host__ __device__ constexpr const S& operator[](Number i) const @@ -79,7 +79,7 @@ struct StaticBufferTupleOfVector return base::operator[](i_v).template AsType()[i_s]; } - // write access of S + // Set S // i is offset of S template __host__ __device__ constexpr S& operator()(Number i) @@ -90,7 +90,7 @@ struct StaticBufferTupleOfVector return base::operator()(i_v).template AsType()(i_s); } - // read access of X + // Get X // i is offset of S, not X. i should be aligned to X template ()[i_x]; } - // write access of X + // Set X // i is offset of S, not X. i should be aligned to X template ()(i_x) = x; } + + // Get read access to vector_type V + // i is offset of S, not V. i should be aligned to V + template + __host__ __device__ constexpr const auto& GetVectorTypeReference(Number i) const + { + static_assert(i % s_per_v == 0, "wrong!"); + + constexpr auto i_v = i / s_per_v; + + return base::operator[](i_v); + } + + // Get write access to vector_type V + // i is offset of S, not V. 
i should be aligned to V
+    template <index_t I>
+    __host__ __device__ constexpr auto& GetVectorTypeReference(Number<I> i)
+    {
+        static_assert(i % s_per_v == 0, "wrong!");
+
+        constexpr auto i_v = i / s_per_v;
+
+        return base::operator()(i_v);
+    }
 };
 
 template
diff --git a/composable_kernel/include/utility/statically_indexed_array.hpp b/composable_kernel/include/utility/statically_indexed_array.hpp
index f30a3a9ee63..372751faf16 100644
--- a/composable_kernel/include/utility/statically_indexed_array.hpp
+++ b/composable_kernel/include/utility/statically_indexed_array.hpp
@@ -8,20 +8,38 @@ namespace ck {
 
 namespace detail {
+template <typename X, typename Y>
+struct tuple_concat;
 
-template <typename T, index_t N>
-__host__ __device__ constexpr auto generate_same_type_tuple()
+template <typename... Xs, typename... Ys>
+struct tuple_concat<Tuple<Xs...>, Tuple<Ys...>>
 {
-    return generate_tuple([](auto) -> T { return T{}; }, Number<N>{});
-}
+    using type = Tuple<Xs..., Ys...>;
+};
 
-template <typename T, index_t N>
-using same_type_tuple = decltype(generate_same_type_tuple<T, N>());
+template <typename T, index_t N>
+struct StaticallyIndexedArrayImpl
+{
+    using type =
+        typename tuple_concat<typename StaticallyIndexedArrayImpl<T, N / 2>::type,
+                              typename StaticallyIndexedArrayImpl<T, N - N / 2>::type>::type;
+};
 
+template <typename T>
+struct StaticallyIndexedArrayImpl<T, 0>
+{
+    using type = Tuple<>;
+};
+
+template <typename T>
+struct StaticallyIndexedArrayImpl<T, 1>
+{
+    using type = Tuple<T>;
+};
 } // namespace detail
 
-template <typename T, index_t N>
-using StaticallyIndexedArray = detail::same_type_tuple<T, N>;
+template <typename T, index_t N>
+using StaticallyIndexedArray = typename detail::StaticallyIndexedArrayImpl<T, N>::type;
 
 template <typename X, typename... Xs>
 __host__ __device__ constexpr auto make_statically_indexed_array(const X& x, const Xs&... xs)
diff --git a/composable_kernel/include/utility/transpose_vectors.hpp b/composable_kernel/include/utility/transpose_vectors.hpp
index 0b865f0ba66..a000c14e06c 100644
--- a/composable_kernel/include/utility/transpose_vectors.hpp
+++ b/composable_kernel/include/utility/transpose_vectors.hpp
@@ -48,35 +48,18 @@ struct transpose_vectors<half_t, NX, NY>
     static constexpr index_t s_per_x = NY;
     static constexpr index_t s_per_y = NX;
 
-    using S = half_t;
-    using X = typename vector_type<half_t, s_per_x>::type;
-    using Y = typename vector_type<half_t, s_per_y>::type;
+    using S  = half_t;
+    using VX = vector_type<S, s_per_x>;
+    using VY = vector_type<S, s_per_y>;
 
-    __device__ void operator()(const StaticallyIndexedArray<X, NX>& x_tuple,
-                               StaticallyIndexedArray<Y, NY>& y_tuple)
+    __device__ void operator()(const StaticallyIndexedArray<const VX&, NX>& vx_tuple,
+                               StaticallyIndexedArray<VY&, NY>& vy_tuple)
     {
-        static constexpr auto I0 = Number<0>{};
         static constexpr auto I1 = Number<1>{};
         static constexpr auto I2 = Number<2>{};
 
-#if 0
-        static_assert((NX == 2 && NY == 2), "wrong!");
-
-        if constexpr(NX == 2 && NY == 2)
-        {
-            transpose_fp16_2x2(x_tuple[I0], x_tuple[I1], y_tuple(I0), y_tuple(I1));
-        }
-#else
         static_assert((NX % 2 == 0 && NY % 2 == 0), "wrong!");
 
-        // create tuple of vector_type for holding data from x_tuple
-        const auto vx_tuple = generate_tuple(
-            [&](auto i) { return vector_type<half_t, s_per_x>{x_tuple[i]}; }, Number<NX>{});
-
-        // create tuple of vector_type to hold intermediate data for y_tuple
-        auto vy_tuple =
-            generate_tuple([&](auto) { return vector_type<half_t, s_per_y>{}; }, Number<NY>{});
-
         // loop over 2x2 tile and transpose data from vx_tuple into vy_tuple
         static_for<0, NY, 2>{}([&](auto iy) {
             static_for<0, NX, 2>{}([&](auto ix) {
                 // reference to 2 half2_t data from vx_tuple
                 const auto& x_s2_0 = vx_tuple[ix].template AsType<half2_t>()[iy / I2];
                 const auto& x_s2_1 = vx_tuple[ix + I1].template AsType<half2_t>()[iy / I2];
 
                 // reference to 2 half2_t data from vy_tuple
                 auto& y_s2_0 = vy_tuple(iy).template AsType<half2_t>()(ix / I2);
                 auto& y_s2_1 = vy_tuple(iy + I1).template AsType<half2_t>()(ix / I2);
 
                 // transpose
                 transpose_fp16_2x2(x_s2_0, x_s2_1, y_s2_0, y_s2_1);
             });
         });
-
-        // copy data from vy_tuple into y_tuple
-        static_for<0, NY, 1>{}([&](auto i) { y_tuple(i) = vy_tuple[i].template AsType<Y>()[I0]; });
-#endif
     }
 };
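[Editor's note on the StaticallyIndexedArrayImpl rewrite above] Building Tuple<T, T, ..., T> by recursive halving keeps template instantiation depth at O(log N) instead of O(N), and, unlike the old generate_same_type_tuple (which returns T{} by value), it can also spell reference element types such as Tuple<T&, T&>, which the pass-by-reference plumbing in this patch relies on. A standalone sketch of the same construction, using std::tuple in place of ck::Tuple (illustrative only):

    #include <cstddef>
    #include <tuple>
    #include <type_traits>

    template <typename X, typename Y>
    struct tuple_concat;

    template <typename... Xs, typename... Ys>
    struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
    {
        using type = std::tuple<Xs..., Ys...>;
    };

    // Recursive halving: N copies of T = (N/2 copies) ++ (N - N/2 copies).
    template <typename T, std::size_t N>
    struct array_impl
    {
        using type = typename tuple_concat<typename array_impl<T, N / 2>::type,
                                           typename array_impl<T, N - N / 2>::type>::type;
    };

    template <typename T>
    struct array_impl<T, 0> { using type = std::tuple<>; };

    template <typename T>
    struct array_impl<T, 1> { using type = std::tuple<T>; };

    static_assert(std::is_same<array_impl<int, 3>::type,
                               std::tuple<int, int, int>>::value, "");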
diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp
index 3dea944f6d1..96cab4b99ee 100644
--- a/composable_kernel/include/utility/tuple.hpp
+++ b/composable_kernel/include/utility/tuple.hpp
@@ -166,5 +166,12 @@ __host__ __device__ constexpr auto make_tuple(Xs&&... xs)
     return Tuple<remove_cvref_t<Xs>...>(std::forward<Xs>(xs)...);
 }
 
+// https://en.cppreference.com/w/cpp/utility/tuple/tie
+template <typename... Args>
+constexpr Tuple<Args&...> tie(Args&... args) noexcept
+{
+    return {args...};
+}
+
 } // namespace ck
 #endif
diff --git a/composable_kernel/include/utility/tuple_helper.hpp b/composable_kernel/include/utility/tuple_helper.hpp
index f568a9d824e..4e5b9cf97c8 100644
--- a/composable_kernel/include/utility/tuple_helper.hpp
+++ b/composable_kernel/include/utility/tuple_helper.hpp
@@ -13,6 +13,13 @@ __host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
                   typename arithmetic_sequence_gen<0, N, 1>::type{});
 }
 
+template <typename F, index_t N>
+__host__ __device__ constexpr auto generate_tie(F&& f, Number<N>)
+{
+    return unpack([&f](auto&&... xs) { return tie(f(xs)...); },
+                  typename arithmetic_sequence_gen<0, N, 1>::type{});
+}
+
 namespace detail {
 
 template
diff --git a/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp b/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp
index 396a55caf89..c44aa7d9a27 100644
--- a/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp
+++ b/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp
@@ -180,14 +180,14 @@ void device_gemm_xdlops_km_kn_mn(const Tensor& a_k_m,
 
     using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
     using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
 
-    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
-    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 2;
+    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
+    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
 
     using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
     using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
 
     constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
-    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 2;
+    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
 
     constexpr index_t CThreadTransferDstScalarPerVector = 1;
 #elif 0
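[Editor's note on tie/generate_tie in PATCH 11] generate_tie builds a Tuple of lvalue references into storage that already exists, so transpose_vectors can write its results straight into dst_thread_scratch_ instead of into default-constructed temporaries, which is where the unnecessary v_mov register initializations came from. A rough standalone illustration of the reference-tuple idea with std::tie (illustrative, not ck code):

    #include <cstdio>
    #include <tuple>

    int main()
    {
        int y0 = 0, y1 = 0;

        // A tuple of references: writing through it mutates y0/y1 in
        // place; nothing is copied and no temporaries are created.
        std::tuple<int&, int&> refs = std::tie(y0, y1);

        std::get<0>(refs) = 11;
        std::get<1>(refs) = 22;

        printf("%d %d\n", y0, y1); // prints: 11 22
        return 0;
    }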

From 405e74a052d1b13de6169834b7daa8beb6b62d3d Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Tue, 19 Oct 2021 20:05:32 -0500
Subject: [PATCH 12/16] add hardcoded logic for NHWC wrw

---
 .../threadwise_tensor_slice_transfer_v3r2.hpp | 37 +++++++++++++++++--
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
index d3a77482829..c776387970e 100644
--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
@@ -687,6 +687,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
+        constexpr auto I4 = Number<4>{};
 
         constexpr auto src_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
@@ -699,7 +700,8 @@ struct ThreadwiseTensorSliceTransfer_v3r2
         constexpr auto desc0 =
             make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length);
 
-        // TODO this is hardcoded for GEMM TN layout, it also works for NHWC backward-weight
+#if 0
+        // TODO this is hardcoded for GEMM TN layout
         // TODO implement the general logic
         constexpr auto desc1 = transform_tensor_descriptor(
             desc0,
             make_tuple(make_pass_through_transform(src_access_lengths_and_vector_length[I0]),
                        make_merge_transform_v3_division_mod(
                            make_tuple(src_access_lengths_and_vector_length[I1],
                                       src_access_lengths_and_vector_length[I3])),
                        make_pass_through_transform(src_access_lengths_and_vector_length[I2])),
             make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#else
+        // TODO this is hardcoded for NHWC backward-weight kernel
+        // TODO implement the general logic
+        constexpr auto desc1 = transform_tensor_descriptor(
+            desc0,
+            make_tuple(make_pass_through_transform(src_access_lengths_and_vector_length[I0]),
+                       make_pass_through_transform(src_access_lengths_and_vector_length[I1]),
+                       make_merge_transform_v3_division_mod(
+                           make_tuple(src_access_lengths_and_vector_length[I2],
+                                      src_access_lengths_and_vector_length[I4])),
+                       make_pass_through_transform(src_access_lengths_and_vector_length[I3])),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 4>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+#endif
 
         return desc1;
     }
@@ -720,6 +736,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
+        constexpr auto I4 = Number<4>{};
 
         constexpr auto dst_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
@@ -731,8 +748,8 @@ struct ThreadwiseTensorSliceTransfer_v3r2
         constexpr auto desc0 =
             make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length);
-
-        // TODO this is hardcoded for GEMM TN layout, it also works for NHWC backward-weight
+#if 0
+        // TODO this is hardcoded for GEMM TN layout
         // TODO implement the general logic
         constexpr auto desc1 = transform_tensor_descriptor(
             desc0,
             make_tuple(make_pass_through_transform(dst_access_lengths_and_vector_length[I0]),
                        make_pass_through_transform(dst_access_lengths_and_vector_length[I1]),
                        make_merge_transform_v3_division_mod(
                            make_tuple(dst_access_lengths_and_vector_length[I2],
                                       dst_access_lengths_and_vector_length[I3]))),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#else
+        // TODO this is hardcoded for NHWC backward-weight kernel
+        // TODO implement the general logic
+        constexpr auto desc1 = transform_tensor_descriptor(
+            desc0,
+            make_tuple(make_pass_through_transform(dst_access_lengths_and_vector_length[I0]),
+                       make_pass_through_transform(dst_access_lengths_and_vector_length[I1]),
+                       make_pass_through_transform(dst_access_lengths_and_vector_length[I2]),
+                       make_merge_transform_v3_division_mod(
+                           make_tuple(dst_access_lengths_and_vector_length[I3],
+                                      dst_access_lengths_and_vector_length[I4]))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+#endif
 
         return desc1;
     }

From 59314e4c60b03a1190c8ee69243949ce97dd35a2 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Tue, 19 Oct 2021 21:48:28 -0500
Subject: [PATCH 13/16] improve asm for v_pack

---
 .../threadwise_tensor_slice_transfer_v3r2.hpp | 15 +++------------
 .../include/utility/transpose_vectors.hpp     | 15 ++++++++++-----
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
index c776387970e..60d24857e70 100644
--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
@@ -185,7 +185,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2
             src_scalar_per_access;
         }();
 
-#if 1 // debug
         constexpr auto src_data_idx_seq =
generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); @@ -198,7 +197,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 src_thread_scratch_.template SetAsType( src_data_idx_seq, src_buf.template Get(src_coord_.GetOffset(), is_src_valid)); -#endif constexpr auto move_on_dim = [&]() constexpr { @@ -330,11 +328,9 @@ struct ThreadwiseTensorSliceTransfer_v3r2 __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) { -#if 1 // debug - // if there is transpose, it's done here - // TODO move this elsewhere + // if there is transpose, it's done here + // TODO move this elsewhere TransferDataFromSrcThreadScratchToDstThreadScratch(); -#endif static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, @@ -422,7 +418,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 dst_scalar_per_access; }(); -#if 1 // debug constexpr auto dst_data_idx_seq = generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); @@ -436,7 +431,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 dst_coord_.GetOffset(), is_dst_valid, dst_thread_scratch_.template GetAsType(dst_data_idx_seq)); -#endif constexpr auto move_on_dim = [&]() constexpr { @@ -680,7 +674,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); } -#if 1 // debug __device__ static constexpr auto GetSrcThreadScratchDescriptor() { constexpr auto I0 = Number<0>{}; @@ -748,6 +741,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 constexpr auto desc0 = make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length); + #if 0 // TODO this is hardcoded for GEMM TN layout // TODO implemenet the general logic @@ -777,10 +771,8 @@ struct ThreadwiseTensorSliceTransfer_v3r2 return desc1; } -#endif private: -#if 1 // debug static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){}; static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){}; @@ -797,7 +789,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 decltype(dst_thread_scratch_desc_), true> dst_thread_scratch_; -#endif SrcCoord src_coord_; DstCoord dst_coord_; diff --git a/composable_kernel/include/utility/transpose_vectors.hpp b/composable_kernel/include/utility/transpose_vectors.hpp index a000c14e06c..866241a9479 100644 --- a/composable_kernel/include/utility/transpose_vectors.hpp +++ b/composable_kernel/include/utility/transpose_vectors.hpp @@ -16,7 +16,7 @@ struct transpose_vectors; // transpose fp16 2x2 __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t& y0, half2_t& y1) { -#if 0 // debug +#if 0 static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -33,11 +33,16 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t y1 = vy1.template AsType()[I0]; #else asm volatile("\n \ - v_pack_b32_f16 %0, %2, %3 \n \ - v_pack_b32_f16 %1, %2, %3, op_sel:[1, 1] \n \ + v_pack_b32_f16 %0, %1, %2 \n \ " - : "=v"(y0), "=v"(y1) - : "v"(x0), "v"(x1), "0"(y0), "1"(y1)); + : "=v"(y0) + : "v"(x0), "v"(x1)); + + asm volatile("\n \ + v_pack_b32_f16 %0, %1, %2, op_sel:[1, 1] \n \ + " + : "=v"(y1) + : "v"(x0), "v"(x1)); #endif } From 310f9ee8428c145683df9375a81f24d81c8e95b0 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 19 Oct 2021 22:44:50 -0500 Subject: [PATCH 14/16] make ThreadwiseTensorSliceTransfer_v3r2 support any tensor --- .../blockwise_tensor_slice_transfer.hpp | 1 - 
.../threadwise_tensor_slice_transfer_v3r2.hpp | 143 +++++++++--------- .../src/gemm_driver_offline.cpp | 6 +- 3 files changed, 73 insertions(+), 77 deletions(-) diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp index f815a47f693..d03bda8fd92 100644 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp @@ -5,7 +5,6 @@ #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" #include "cluster_descriptor.hpp" -#include "threadwise_tensor_slice_transfer.hpp" #include "threadwise_tensor_slice_transfer_v3r2.hpp" namespace ck { diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index 60d24857e70..c906e3bf9e1 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -245,12 +245,10 @@ struct ThreadwiseTensorSliceTransfer_v3r2 __device__ void TransferDataFromSrcThreadScratchToDstThreadScratch() { -#if 0 // debug - static_ford{}([&](auto idx) { - // convert from SrcData to DstData here - dst_thread_scratch_(idx) = type_convert{}(src_thread_scratch_[idx]); - }); -#else + // TODO make this logic more generic + // TODO missing type_convert !!!!!!!!!!!!! + + // if constexpr(SrcVectorDim == DstVectorDim) { static_ford{}([&](auto idx) { @@ -321,7 +319,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 src_vector_refs, dst_vector_refs); }); } -#endif } template @@ -676,12 +673,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 __device__ static constexpr auto GetSrcThreadScratchDescriptor() { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto src_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); @@ -690,47 +681,48 @@ struct ThreadwiseTensorSliceTransfer_v3r2 constexpr auto src_access_lengths_and_vector_length = container_push_back( sequence_to_tuple_of_number(src_access_lengths), Number{}); + // 1st stage of transforms constexpr auto desc0 = make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length); -#if 0 - // TODO this is hardcoded for GEMM TN layout - // TODO implemenet the general logic - constexpr auto desc1 = transform_tensor_descriptor( - desc0, - make_tuple(make_pass_through_transform(src_access_lengths_and_vector_length[I0]), - make_merge_transform_v3_division_mod( - make_tuple(src_access_lengths_and_vector_length[I1], - src_access_lengths_and_vector_length[I3])), - make_pass_through_transform(src_access_lengths_and_vector_length[I2])), - make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); -#else - // TODO this is hardcoded for NHWC backward-weight kernel - // TODO implemenet the general logic - constexpr auto desc1 = transform_tensor_descriptor( - desc0, - make_tuple(make_pass_through_transform(src_access_lengths_and_vector_length[I0]), - make_pass_through_transform(src_access_lengths_and_vector_length[I1]), - make_merge_transform_v3_division_mod( - make_tuple(src_access_lengths_and_vector_length[I2], - 
src_access_lengths_and_vector_length[I4])), - make_pass_through_transform(src_access_lengths_and_vector_length[I3])), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 4>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); -#endif + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(src_access_lengths_and_vector_length[i], + src_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(src_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); - return desc1; + return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } __device__ static constexpr auto GetDstThreadScratchDescriptor() { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - + // 1st stage of transforms constexpr auto dst_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); @@ -742,34 +734,39 @@ struct ThreadwiseTensorSliceTransfer_v3r2 constexpr auto desc0 = make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length); -#if 0 - // TODO this is hardcoded for GEMM TN layout - // TODO implemenet the general logic - constexpr auto desc1 = transform_tensor_descriptor( - desc0, - make_tuple(make_pass_through_transform(dst_access_lengths_and_vector_length[I0]), - make_pass_through_transform(dst_access_lengths_and_vector_length[I1]), - make_merge_transform_v3_division_mod( - make_tuple(dst_access_lengths_and_vector_length[I2], - dst_access_lengths_and_vector_length[I3]))), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); -#else - // TODO this is hardcoded for NHWC backward-weight kernel - // TODO implemenet the general logic - constexpr auto desc1 = transform_tensor_descriptor( - desc0, - make_tuple(make_pass_through_transform(dst_access_lengths_and_vector_length[I0]), - make_pass_through_transform(dst_access_lengths_and_vector_length[I1]), - make_pass_through_transform(dst_access_lengths_and_vector_length[I2]), - make_merge_transform_v3_division_mod( - make_tuple(dst_access_lengths_and_vector_length[I3], - dst_access_lengths_and_vector_length[I4]))), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); -#endif + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == DstVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(dst_access_lengths_and_vector_length[i], + dst_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(dst_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == DstVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); 
- return desc1; + return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } private: diff --git a/host/driver_offline/src/gemm_driver_offline.cpp b/host/driver_offline/src/gemm_driver_offline.cpp index fec8170422c..e60b4905ae7 100644 --- a/host/driver_offline/src/gemm_driver_offline.cpp +++ b/host/driver_offline/src/gemm_driver_offline.cpp @@ -22,10 +22,10 @@ #include "device_gemm_xdlops_km_kn_nm.hpp" #include "device_gemm_xdlops_km_nk_nm.hpp" -#define USE_GEMM_XDL_MK_KN_MN 0 -#define USE_GEMM_XDL_MK_NK_MN 0 +#define USE_GEMM_XDL_MK_KN_MN 1 +#define USE_GEMM_XDL_MK_NK_MN 1 #define USE_GEMM_XDL_KM_KN_MN 1 -#define USE_GEMM_XDL_KM_NK_MN 0 +#define USE_GEMM_XDL_KM_NK_MN 1 #define USE_GEMM_XDL_MK_KN_NM 0 #define USE_GEMM_XDL_MK_NK_NM 0 #define USE_GEMM_XDL_KM_KN_NM 0 From 0112c76cc6c369aac4d9472a259546d5fcb6a2fe Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 20 Oct 2021 00:26:53 -0500 Subject: [PATCH 15/16] tweak --- .../threadwise_tensor_slice_transfer_v3r2.hpp | 7 +++++++ ...data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index c906e3bf9e1..13214829f4e 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -245,6 +245,12 @@ struct ThreadwiseTensorSliceTransfer_v3r2 __device__ void TransferDataFromSrcThreadScratchToDstThreadScratch() { +#if 0 // debug + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = type_convert{}(src_thread_scratch_[idx]); + }); +#else // TODO make this logic more generic // TODO missing type_convert !!!!!!!!!!!!! 
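[Editor's note on the "missing type_convert" TODO above] The vectorized path below moves raw bits through the 2x2 fp16 transpose, so it is only correct when SrcData and DstData are the same 16-bit type. A conversion-aware fallback would have to visit elements individually, along the lines of this sketch (a hypothetical helper, not part of the patch; it assumes the SliceLengths, src_thread_scratch_/dst_thread_scratch_ members and the static_ford/type_convert utilities used elsewhere in this file):

    // Scalar fallback: correct for any SrcData/DstData pair, but gives up
    // the packed transpose and its v_pack_b32_f16 codegen.
    __device__ void TransferWithConvert()
    {
        static_ford<SliceLengths>{}([&](auto idx) {
            // convert element-by-element while copying
            dst_thread_scratch_(idx) = type_convert<DstData>{}(src_thread_scratch_[idx]);
        });
    }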
@@ -319,6 +325,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 src_vector_refs, dst_vector_refs); }); } +#endif } template diff --git a/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp b/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp index d6955ec0005..e58fb08914c 100644 --- a/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp +++ b/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp @@ -104,7 +104,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; -#elif 1 +#elif 0 // [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16 constexpr index_t BlockSize = 256; @@ -132,7 +132,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; -#elif 0 +#elif 1 // [M, N, K0, K1] = [128, 256, 4, 8], C = 128, for fp16 constexpr index_t BlockSize = 256; From b684ae4ebdfdc46afc56bed1173a85930e9bc765 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 15 Nov 2021 09:22:22 -0600 Subject: [PATCH 16/16] reorganize file --- .../{tensor_description => utility}/is_known_at_compile_time.hpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename composable_kernel/include/{tensor_description => utility}/is_known_at_compile_time.hpp (100%) diff --git a/composable_kernel/include/tensor_description/is_known_at_compile_time.hpp b/composable_kernel/include/utility/is_known_at_compile_time.hpp similarity index 100% rename from composable_kernel/include/tensor_description/is_known_at_compile_time.hpp rename to composable_kernel/include/utility/is_known_at_compile_time.hpp
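[Editor's closing note] As an end-to-end reference for the series: transpose_vectors turns NX input vectors of NY 16-bit scalars each into NY output vectors of NX scalars each. The element-wise meaning of the 2x2-tiled v_pack loop, written as a plain C++ host program (uint16_t standing in for half_t; illustrative only, not ck code):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // xs holds NX vectors of NY scalars; ys receives the transpose.
        const int NX = 4, NY = 4;
        uint16_t xs[NX][NY], ys[NY][NX];

        for(int ix = 0; ix < NX; ++ix)
            for(int iy = 0; iy < NY; ++iy)
                xs[ix][iy] = static_cast<uint16_t>(ix * NY + iy);

        // what the 2x2-tiled v_pack_b32_f16 loop computes, element-wise
        for(int iy = 0; iy < NY; ++iy)
            for(int ix = 0; ix < NX; ++ix)
                ys[iy][ix] = xs[ix][iy];

        printf("ys[1][2] = %d (expect 2 * NY + 1 = 9)\n", (int)ys[1][2]);
        return 0;
    }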