From 55dd2c844c3a3075cddf6abfddb698e73a27d569 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 13 Oct 2021 10:37:24 -0500 Subject: [PATCH 01/16] start fixing 16bit data packing --- .../threadwise_tensor_slice_transfer.hpp | 582 ++++++++++++++++++ composable_kernel/include/utility/config.hpp | 2 +- .../src/gemm_driver_offline.cpp | 6 +- 3 files changed, 586 insertions(+), 4 deletions(-) diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index 157828bf0fc..1f3dc8fe7ea 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -670,6 +670,587 @@ struct ThreadwiseTensorSliceTransfer_v2 SrcCoord src_coord_; }; // namespace ck +#if 0 +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. Use thread buffer +template // control whether to move back dst coordinate after each + // RunWrite(), will be fused with MoveDstSliceWindow to + // save addr computation +struct ThreadwiseTensorSliceTransfer_v3 +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v3(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) + { + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void + RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! 
SrcBuffer and SrcData data type are inconsistent"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, forward_step_idx, src_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, backward_step_idx, src_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + vector_type_maker_t src_tmp_vector; + + using src_vector_t = typename decltype(src_tmp_vector)::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf to src_tmp_vector + src_tmp_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + + // copy data from src_tmp_vector to buffer_ + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector); + + buffer_(Number{}) = src_tmp_vector.template AsType()[i]; + }); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + } + + template + __device__ void + RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) + { + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // src scalar per access on each dim + // TODO: don't use this + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, forward_step_idx, dst_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? 
-dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, backward_step_idx, dst_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + vector_type_maker_t dst_tmp_vector; + + // copy data from buffer_ to dst_tmp_vector + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector); + + dst_tmp_vector.template AsType()(i) = + type_convert{}(buffer_[Number{}]); + }); + + using dst_vector_t = typename decltype(dst_tmp_vector)::type; + + // copy data from dst_tmp_vector to dst_buf + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_tmp_vector.template AsType()[Number<0>{}]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) + { + constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto src_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunRead(src_desc, src_buf, src_step_hacks); + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto dst_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { 
return zeros; }, Number{})); + + RunWrite(dst_desc, dst_buf, dst_step_hacks); + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx, + const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step( + src_desc, adjusted_step_idx, src_move_slice_window_step_hack); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + static constexpr auto buffer_desc_ = + make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + + static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); + + StaticBuffer buffer_; + + SrcCoord src_coord_; + DstCoord dst_coord_; +}; +#else // Assume: // 1. src_desc and dst_desc are not known at compile-time // 2. SrcBuffer and DstBuffer are DynamicBuffer @@ -1249,6 +1830,7 @@ struct ThreadwiseTensorSliceTransfer_v3 SrcCoord src_coord_; DstCoord dst_coord_; }; +#endif // Assume: // 1. 
src: diff --git a/composable_kernel/include/utility/config.hpp b/composable_kernel/include/utility/config.hpp index 5ee4bb9c642..62f92d1d5a4 100644 --- a/composable_kernel/include/utility/config.hpp +++ b/composable_kernel/include/utility/config.hpp @@ -94,7 +94,7 @@ #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 // merge transformation use magic number division -#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0 +#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1 // hack: have underlying assumption that need to be satsified, otherwise it's a bug // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be diff --git a/host/driver_offline/src/gemm_driver_offline.cpp b/host/driver_offline/src/gemm_driver_offline.cpp index e60b4905ae7..fec8170422c 100644 --- a/host/driver_offline/src/gemm_driver_offline.cpp +++ b/host/driver_offline/src/gemm_driver_offline.cpp @@ -22,10 +22,10 @@ #include "device_gemm_xdlops_km_kn_nm.hpp" #include "device_gemm_xdlops_km_nk_nm.hpp" -#define USE_GEMM_XDL_MK_KN_MN 1 -#define USE_GEMM_XDL_MK_NK_MN 1 +#define USE_GEMM_XDL_MK_KN_MN 0 +#define USE_GEMM_XDL_MK_NK_MN 0 #define USE_GEMM_XDL_KM_KN_MN 1 -#define USE_GEMM_XDL_KM_NK_MN 1 +#define USE_GEMM_XDL_KM_NK_MN 0 #define USE_GEMM_XDL_MK_KN_NM 0 #define USE_GEMM_XDL_MK_NK_NM 0 #define USE_GEMM_XDL_KM_KN_NM 0 From f513f88740f547f07abc70dcb52b1a71ac880e34 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 16 Oct 2021 12:56:56 -0500 Subject: [PATCH 02/16] adding StaticTensor --- .../is_known_at_compile_time.hpp | 49 ++ .../tensor_description/static_tensor.hpp | 105 +++ .../blockwise_tensor_slice_transfer.hpp | 35 +- .../threadwise_tensor_slice_transfer.hpp | 582 ----------------- .../threadwise_tensor_slice_transfer_v3r2.hpp | 607 ++++++++++++++++++ .../include/utility/common_header.hpp | 1 + .../include/utility/container_helper.hpp | 13 - composable_kernel/include/utility/ignore.hpp | 21 + .../include/utility/tuple_helper.hpp | 16 - composable_kernel/include/utility/type.hpp | 15 - 10 files changed, 801 insertions(+), 643 deletions(-) create mode 100644 composable_kernel/include/tensor_description/is_known_at_compile_time.hpp create mode 100644 composable_kernel/include/tensor_description/static_tensor.hpp create mode 100644 composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp create mode 100644 composable_kernel/include/utility/ignore.hpp diff --git a/composable_kernel/include/tensor_description/is_known_at_compile_time.hpp b/composable_kernel/include/tensor_description/is_known_at_compile_time.hpp new file mode 100644 index 00000000000..9dbe22f2eea --- /dev/null +++ b/composable_kernel/include/tensor_description/is_known_at_compile_time.hpp @@ -0,0 +1,49 @@ +#ifndef IS_KNOWN_AT_COMPILE_TIME_HPP +#define IS_KNOWN_AT_COMPILE_TIME_HPP + +#include "config.hpp" +#include "integral_constant.hpp" +#include "sequence.hpp" +#include "tuple.hpp" + +namespace ck { + +template +struct is_known_at_compile_time; + +template <> +struct is_known_at_compile_time +{ + static constexpr bool value = false; +}; + +template +struct is_known_at_compile_time> +{ + static constexpr bool value = true; +}; + +template +struct is_known_at_compile_time> +{ + static constexpr bool value = true; +}; + +template +struct is_known_at_compile_time> +{ + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return container_reduce( + Tuple{}, + [](auto x, bool r) { + return is_known_at_compile_time>::value & r; + }, + true); + } + + static constexpr bool 
value = IsKnownAtCompileTime(); +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/static_tensor.hpp b/composable_kernel/include/tensor_description/static_tensor.hpp new file mode 100644 index 00000000000..bdf98b7b702 --- /dev/null +++ b/composable_kernel/include/tensor_description/static_tensor.hpp @@ -0,0 +1,105 @@ +#ifndef CK_STATIC_TENSOR_HPP +#define CK_STATIC_TENSOR_HPP + +#include "ignore.hpp" +#include "static_buffer.hpp" + +namespace ck { + +template ::type = false> +struct StaticTensor +{ + static constexpr index_t NDim = TensorDesc::GetNumOfDimension(); + static constexpr index_t buffer_size_ = TensorDesc{}.GetElementSpaceSize(); + + static constexpr auto desc_ = TensorDesc{}; + + using Buffer = StaticBuffer; + + __host__ __device__ constexpr StaticTensor() : invalid_element_value_{0} {} + + __host__ __device__ constexpr StaticTensor(T invalid_element_value) + : invalid_element_value_{invalid_element_value} + { + } + + template ::value && Idx::Size() == NDim, + bool>::type = false> + __host__ __device__ constexpr const T& operator[](Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return buffer_[Number{}]; + } + else + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + return T{0}; + } + else + { + return invalid_element_value_; + } + } + } + + template ::value && Idx::Size() == NDim, + bool>::type = false> + __host__ __device__ T& operator()(Idx) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return buffer_(Number{}); + } + else + { + return ignore; + } + } + + Buffer buffer_; + T invalid_element_value_ = T{0}; +}; + +template ::type = false> +__host__ __device__ constexpr auto make_static_tensor(TensorDesc) +{ + return StaticTensor{}; +} + +template < + AddressSpaceEnum_t AddressSpace, + typename T, + typename TensorDesc, + typename X, + typename enable_if::type = false, + typename enable_if, remove_cvref_t>::value, bool>::type = false> +__host__ __device__ constexpr auto make_static_tensor(TensorDesc, X invalid_element_value) +{ + return StaticTensor{invalid_element_value}; +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp index 0214b713522..e0330853a74 100644 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp @@ -5,7 +5,8 @@ #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" #include "cluster_descriptor.hpp" -#include "threadwise_tensor_slice_transfer.hpp" +//#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer_v3r2.hpp" namespace ck { @@ -146,22 +147,22 @@ struct BlockwiseTensorSliceTransfer_v4 make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); using ThreadwiseTransfer = - ThreadwiseTensorSliceTransfer_v3; + ThreadwiseTensorSliceTransfer_v3r2; ThreadwiseTransfer threadwise_transfer_; }; diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp 
b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index 1f3dc8fe7ea..157828bf0fc 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -670,587 +670,6 @@ struct ThreadwiseTensorSliceTransfer_v2 SrcCoord src_coord_; }; // namespace ck -#if 0 -// Assume: -// 1. src_desc and dst_desc are not known at compile-time -// 2. SrcBuffer and DstBuffer are DynamicBuffer -// 3. src_slice_origin and dst_slice_origin are not known at compile-time, -// 4. Use thread buffer -template // control whether to move back dst coordinate after each - // RunWrite(), will be fused with MoveDstSliceWindow to - // save addr computation -struct ThreadwiseTensorSliceTransfer_v3 -{ - static constexpr index_t nDim = SliceLengths::Size(); - using Index = MultiIndex; - - using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - - using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); - using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); - - __device__ constexpr ThreadwiseTensorSliceTransfer_v3(const SrcDesc& src_desc, - const Index& src_slice_origin, - const DstDesc& dst_desc, - const Index& dst_slice_origin) - : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), - dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) - { - } - - __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) - { - src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); - } - - __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) - { - dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); - } - - template - __device__ void - RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) - { - static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, - "wrong!"); - - static_assert( - is_same, remove_cvref_t>::value, - "wrong! SrcBuffer and SrcData data type are inconsistent"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto src_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - - constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - - constexpr auto src_dim_access_order = SrcDimAccessOrder{}; - - constexpr auto ordered_src_access_lengths = - container_reorder_given_new2old(src_access_lengths, src_dim_access_order); - - // make forward steps - const auto src_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, forward_step_idx, src_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto src_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? 
-src_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, backward_step_idx, src_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_src_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_src_access_idx[I0]; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i] - : ordered_src_access_lengths[i] - 1 - - ordered_src_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * - src_scalar_per_access; - }(); - - vector_type_maker_t src_tmp_vector; - - using src_vector_t = typename decltype(src_tmp_vector)::type; - - const bool is_src_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); - - // copy data from src_buf to src_tmp_vector - src_tmp_vector.template AsType()(Number<0>{}) = - src_buf.template Get(src_coord_.GetOffset(), is_src_valid); - - // copy data from src_tmp_vector to buffer_ - static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { - constexpr index_t buffer_offset = - buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector); - - buffer_(Number{}) = src_tmp_vector.template AsType()[i]; - }); - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= - ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); - } - } - }); - }); - - // move src coordinate back to slice origin (or not) - if constexpr(SrcResetCoordinateAfterRun) - { - const auto src_reset_step = - make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); - - move_tensor_coordinate(src_desc, src_coord_, src_reset_step); - } - } - - template - __device__ void - RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) - { - static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, - "wrong!"); - - static_assert( - is_same, remove_cvref_t>::value, - "wrong! 
SrcBuffer or DstBuffer data type is wrong"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - // src scalar per access on each dim - // TODO: don't use this - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto dst_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - - constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dst_dim_access_order = DstDimAccessOrder{}; - - constexpr auto ordered_dst_access_lengths = - container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); - - // make forward steps - const auto dst_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, forward_step_idx, dst_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto dst_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, backward_step_idx, dst_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_dst_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_dst_access_idx[I0]; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_idx[i] - : ordered_dst_access_lengths[i] - 1 - - ordered_dst_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * - dst_scalar_per_access; - }(); - - vector_type_maker_t dst_tmp_vector; - - // copy data from buffer_ to dst_tmp_vector - static_for<0, DstScalarPerVector, 1>{}([&](auto i) { - constexpr index_t buffer_offset = - buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector); - - dst_tmp_vector.template AsType()(i) = - type_convert{}(buffer_[Number{}]); - }); - - using dst_vector_t = typename decltype(dst_tmp_vector)::type; - - // copy data from dst_tmp_vector to dst_buf - const bool is_dst_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); - - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_tmp_vector.template AsType()[Number<0>{}]); - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= - ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); - } - } - }); - }); - - // move dst coordinate back to slice origin (or not) - if constexpr(DstResetCoordinateAfterRun) - { - const auto dst_reset_step = - make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); - - move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); - } - } - - template - __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) - { - constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto src_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - RunRead(src_desc, src_buf, src_step_hacks); - } - - template - __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) - { - constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto dst_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - RunWrite(dst_desc, dst_buf, dst_step_hacks); - } - - __device__ static constexpr auto GetSrcCoordinateResetStep() - { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - - constexpr auto src_dim_access_order = SrcDimAccessOrder{}; - - constexpr auto ordered_src_access_lengths = - container_reorder_given_new2old(src_access_lengths, src_dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, 
nDim, 1>{}([&](auto i) { - index_t tmp = ordered_src_access_lengths[I0] - 1; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index after last iteration in RunRead(), if it has not being reset by - // RunRead() - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * - src_scalar_per_access; - }(); - - // - constexpr auto reset_src_data_step = [&]() { - Index reset_src_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); - - return reset_src_data_step_; - }(); - - return reset_src_data_step; - } - - __device__ static constexpr auto GetDstCoordinateResetStep() - { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dst_dim_access_order = DstDimAccessOrder{}; - - constexpr auto ordered_dst_access_lengths = - container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_dst_access_lengths[I0] - 1; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index after last iteration in RunWrite(), if it has not being reset by - // RunWrite() - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * - dst_scalar_per_access; - }(); - - // - constexpr auto reset_dst_data_step = [&]() { - Index reset_dst_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); - - return reset_dst_data_step_; - }(); - - return reset_dst_data_step; - } - - // src_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& src_slice_origin_step_idx) - { - // if src coord was not reset by RunRead(), then need to adjust the step here - const auto adjusted_step_idx = - SrcResetCoordinateAfterRun ? src_slice_origin_step_idx - : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); - - // is it OK to construct a new step every time? 
- const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); - - move_tensor_coordinate(src_desc, src_coord_, adjusted_step); - } - - // src_slice_origin_step_idx need to be known at compile-time, for performance reason - template - __device__ void - MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& src_slice_origin_step_idx, - const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) - { - // if src coord was not reset by RunRead(), then need to adjust the step here - const auto adjusted_step_idx = - SrcResetCoordinateAfterRun ? src_slice_origin_step_idx - : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step( - src_desc, adjusted_step_idx, src_move_slice_window_step_hack); - - move_tensor_coordinate(src_desc, src_coord_, adjusted_step); - } - // dst_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, - const Index& dst_slice_origin_step_idx) - { - // if dst coord was not reset by RunWrite(), then need to adjust the step here - const auto adjusted_step_idx = - DstResetCoordinateAfterRun ? dst_slice_origin_step_idx - : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); - - move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); - } - - private: - static constexpr auto buffer_desc_ = - make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); - - static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); - - StaticBuffer buffer_; - - SrcCoord src_coord_; - DstCoord dst_coord_; -}; -#else // Assume: // 1. src_desc and dst_desc are not known at compile-time // 2. SrcBuffer and DstBuffer are DynamicBuffer @@ -1830,7 +1249,6 @@ struct ThreadwiseTensorSliceTransfer_v3 SrcCoord src_coord_; DstCoord dst_coord_; }; -#endif // Assume: // 1. src: diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp new file mode 100644 index 00000000000..f46d99c1102 --- /dev/null +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -0,0 +1,607 @@ +#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R2_HPP +#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R2_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "static_tensor.hpp" + +namespace ck { + +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. 
Use thread buffer +template // control whether to move back dst coordinate after each + // RunWrite(), will be fused with MoveDstSliceWindow to + // save addr computation +struct ThreadwiseTensorSliceTransfer_v3r2 +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v3r2(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) + { + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void + RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer and SrcData data type are inconsistent"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, forward_step_idx, src_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? 
-src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, backward_step_idx, src_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + +#if 1 // debug + constexpr auto src_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); +#endif + + vector_type_maker_t src_tmp_vector; + + using src_vector_t = typename decltype(src_tmp_vector)::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf to src_tmp_vector + src_tmp_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + + // copy data from src_tmp_vector to thread_tensor_ + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { +#if 0 // debug + thread_tensor_(src_data_idx + i * src_scalar_step_in_vector) = + src_tmp_vector.template AsType()[i]; +#else + thread_tensor_(src_data_idx_seq + i * src_scalar_step_in_vector) = + src_tmp_vector.template AsType()[i]; +#endif + }); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + } + + template + __device__ void + RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) + { + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! 
SrcBuffer or DstBuffer data type is wrong"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // src scalar per access on each dim + // TODO: don't use this + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, forward_step_idx, dst_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, backward_step_idx, dst_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + +#if 1 // debug + constexpr auto dst_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); +#endif + + vector_type_maker_t dst_tmp_vector; + + // copy data from thread_tensor_ to dst_tmp_vector + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { +#if 0 // debug + dst_tmp_vector.template AsType()(i) = type_convert{}( + thread_tensor_[dst_data_idx + i * dst_scalar_step_in_vector]); +#else + dst_tmp_vector.template AsType()(i) = type_convert{}( + thread_tensor_[dst_data_idx_seq + i * dst_scalar_step_in_vector]); +#endif + }); + + using dst_vector_t = typename decltype(dst_tmp_vector)::type; + + // copy data from dst_tmp_vector to dst_buf + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_tmp_vector.template AsType()[Number<0>{}]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) + { + constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto src_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunRead(src_desc, src_buf, src_step_hacks); + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto dst_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunWrite(dst_desc, dst_buf, dst_step_hacks); + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, 
src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? 
+ const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx, + const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step( + src_desc, adjusted_step_idx, src_move_slice_window_step_hack); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + static constexpr auto thread_tensor_desc_ = + make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + + StaticTensor + thread_tensor_; + + SrcCoord src_coord_; + DstCoord dst_coord_; +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 85c02a1b99d..2947ec9c43c 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -31,6 +31,7 @@ #include "amd_buffer_addressing.hpp" #include "static_buffer.hpp" #include "dynamic_buffer.hpp" +#include "is_known_at_compile_time.hpp" #include "inner_product.hpp" diff --git a/composable_kernel/include/utility/container_helper.hpp b/composable_kernel/include/utility/container_helper.hpp index a7ed8ec059e..a92e79908d9 100644 --- a/composable_kernel/include/utility/container_helper.hpp +++ b/composable_kernel/include/utility/container_helper.hpp @@ -373,19 +373,6 @@ set_container_subset(Tuple& y, Sequence picks, const Tuple& static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; }); } -template -__host__ __device__ constexpr auto to_tuple_of_number(const Container&) -{ - static_assert(is_known_at_compile_time::value, "wrong!"); - - return generate_tuple( - [&](auto i) { - constexpr index_t tmp = Container::At(i); - return Number{}; - }, - Container::Size()); -} - template __host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence) { diff --git a/composable_kernel/include/utility/ignore.hpp b/composable_kernel/include/utility/ignore.hpp new file mode 100644 index 00000000000..8a199159b3e --- /dev/null +++ b/composable_kernel/include/utility/ignore.hpp @@ -0,0 +1,21 @@ +#ifndef CK_IGNORE_HPP +#define CK_IGNORE_HPP + +// https://en.cppreference.com/w/cpp/utility/tuple/ignore + +namespace ck { + +namespace detail { +struct ignore_t +{ + template + constexpr void operator=(T&&) const noexcept + { + } +}; +} // 
namespace detail + +inline constexpr detail::ignore_t ignore; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/tuple_helper.hpp b/composable_kernel/include/utility/tuple_helper.hpp index 55a79d2594e..f568a9d824e 100644 --- a/composable_kernel/include/utility/tuple_helper.hpp +++ b/composable_kernel/include/utility/tuple_helper.hpp @@ -6,22 +6,6 @@ namespace ck { -template -struct is_known_at_compile_time> -{ - __host__ __device__ static constexpr bool IsKnownAtCompileTime() - { - return container_reduce( - Tuple{}, - [](auto x, bool r) { - return is_known_at_compile_time>::value & r; - }, - true); - } - - static constexpr bool value = IsKnownAtCompileTime(); -}; - template __host__ __device__ constexpr auto generate_tuple(F&& f, Number) { diff --git a/composable_kernel/include/utility/type.hpp b/composable_kernel/include/utility/type.hpp index 89a2bdbde63..8e2c2016733 100644 --- a/composable_kernel/include/utility/type.hpp +++ b/composable_kernel/include/utility/type.hpp @@ -28,21 +28,6 @@ using remove_cvref_t = remove_cv_t>; template inline constexpr bool is_pointer_v = std::is_pointer::value; -template -struct is_known_at_compile_time; - -template <> -struct is_known_at_compile_time -{ - static constexpr bool value = false; -}; - -template -struct is_known_at_compile_time> -{ - static constexpr bool value = true; -}; - template ::type = false> __host__ __device__ constexpr Y as_type(X x) { From 02a1b50cb9d694b0aff3690615b1263857530887 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 16 Oct 2021 22:10:54 -0500 Subject: [PATCH 03/16] adding StaticTensor --- .../tensor_description/static_tensor.hpp | 103 +++++++++++-- .../tensor_description/tensor_adaptor.hpp | 14 ++ .../blockwise_gemm_xdlops.hpp | 5 +- .../threadwise_tensor_slice_transfer_v3r2.hpp | 11 +- .../include/utility/common_header.hpp | 2 + .../include/utility/static_buffer.hpp | 145 +++++------------- .../static_buffer_of_vector_type_v2.hpp | 100 ++++++++++++ composable_kernel/include/utility/tuple.hpp | 4 + ...icit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp | 4 +- 9 files changed, 266 insertions(+), 122 deletions(-) create mode 100644 composable_kernel/include/utility/static_buffer_of_vector_type_v2.hpp diff --git a/composable_kernel/include/tensor_description/static_tensor.hpp b/composable_kernel/include/tensor_description/static_tensor.hpp index bdf98b7b702..447dca7c176 100644 --- a/composable_kernel/include/tensor_description/static_tensor.hpp +++ b/composable_kernel/include/tensor_description/static_tensor.hpp @@ -2,10 +2,10 @@ #define CK_STATIC_TENSOR_HPP #include "ignore.hpp" -#include "static_buffer.hpp" namespace ck { +// StaticTensor for Scalar template ::type = false> struct StaticTensor { - static constexpr index_t NDim = TensorDesc::GetNumOfDimension(); - static constexpr index_t buffer_size_ = TensorDesc{}.GetElementSpaceSize(); - - static constexpr auto desc_ = TensorDesc{}; - - using Buffer = StaticBuffer; + static constexpr auto desc_ = TensorDesc{}; + static constexpr index_t ndim_ = TensorDesc::GetNumOfDimension(); + static constexpr index_t element_space_size_ = desc_.GetElementSpaceSize(); __host__ __device__ constexpr StaticTensor() : invalid_element_value_{0} {} @@ -27,8 +24,87 @@ struct StaticTensor { } + // read access + template ::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr const T& operator[](Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + 
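+        // note (editorial): Idx is a compile-time index, so coord, offset, and the
+        // is_valid flag below all fold to constants, and the invalid-element
+        // fallback is selected by "if constexpr" at zero runtime cost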
constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_[Number{}]; + } + else + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + return T{0}; + } + else + { + return invalid_element_value_; + } + } + } + + // write access + template ::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr T& operator()(Idx) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_(Number{}); + } + else + { + return ignore; + } + } + + StaticBuffer data_; + T invalid_element_value_ = T{0}; +}; + +// StaticTensor for vector +template ::type = false> +struct StaticTensorTupleOfVectorBuffer +{ + static constexpr auto desc_ = TensorDesc{}; + static constexpr index_t ndim_ = TensorDesc::GetNumOfDimension(); + static constexpr index_t element_space_size_ = desc_.GetElementSpaceSize(); + + static constexpr index_t num_of_vector_ = + math::integer_divide_ceil(element_space_size_, ScalarPerVector); + + __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer() : invalid_element_value_{0} {} + + __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer(T invalid_element_value) + : invalid_element_value_{invalid_element_value} + { + } + + // read access template ::value && Idx::Size() == NDim, + typename enable_if::value && Idx::Size() == ndim_, bool>::type = false> __host__ __device__ constexpr const T& operator[](Idx) const { @@ -40,7 +116,7 @@ struct StaticTensor if constexpr(is_valid) { - return buffer_[Number{}]; + return data_[Number{}]; } else { @@ -55,10 +131,11 @@ struct StaticTensor } } + // write access template ::value && Idx::Size() == NDim, + typename enable_if::value && Idx::Size() == ndim_, bool>::type = false> - __host__ __device__ T& operator()(Idx) + __host__ __device__ constexpr T& operator()(Idx) { constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); @@ -68,7 +145,7 @@ struct StaticTensor if constexpr(is_valid) { - return buffer_(Number{}); + return data_(Number{}); } else { @@ -76,7 +153,7 @@ struct StaticTensor } } - Buffer buffer_; + StaticBufferTupleOfVector data_; T invalid_element_value_ = T{0}; }; diff --git a/composable_kernel/include/tensor_description/tensor_adaptor.hpp b/composable_kernel/include/tensor_description/tensor_adaptor.hpp index 50a8088bbab..8787abd6ba6 100644 --- a/composable_kernel/include/tensor_description/tensor_adaptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_adaptor.hpp @@ -151,6 +151,20 @@ struct TensorAdaptor __host__ __device__ constexpr auto GetElementSize() const { return element_size_; } +#if 0 // debug + template + __host__ __device__ constexpr index_t GetTopDimensionLength(Number idim) const + { + // TODO: not implemented + } + + template + __host__ __device__ constexpr index_t GetBottomDimensionLength(Number idim) const + { + // TODO: not implemented + } +#endif + template __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const { diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp index 36c67832042..cb0aa1c326c 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp @@ -37,7 +37,10 @@ struct 
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL); - StaticBufferV2, MRepeat * NRepeat, true> + StaticBufferOfVectorTypeV2, + MRepeat * NRepeat, + true> c_thread_buf_; __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index f46d99c1102..2e9c8c09f6a 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -330,7 +330,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 // copy data from thread_tensor_ to dst_tmp_vector static_for<0, DstScalarPerVector, 1>{}([&](auto i) { -#if 0 // debug +#if 0 dst_tmp_vector.template AsType()(i) = type_convert{}( thread_tensor_[dst_data_idx + i * dst_scalar_step_in_vector]); #else @@ -596,8 +596,17 @@ struct ThreadwiseTensorSliceTransfer_v3r2 static constexpr auto thread_tensor_desc_ = make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); +#if 0 StaticTensor thread_tensor_; +#else + StaticTensorTupleOfVectorBuffer + thread_tensor_; +#endif SrcCoord src_coord_; DstCoord dst_coord_; diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 2947ec9c43c..7c9a6b60c00 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -30,6 +30,8 @@ #include "amd_address_space.hpp" #include "amd_buffer_addressing.hpp" #include "static_buffer.hpp" +// TODO remove this +#include "static_buffer_of_vector_type_v2.hpp" #include "dynamic_buffer.hpp" #include "is_known_at_compile_time.hpp" diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp index 9615d10c597..97cd80451df 100644 --- a/composable_kernel/include/utility/static_buffer.hpp +++ b/composable_kernel/include/utility/static_buffer.hpp @@ -5,158 +5,93 @@ namespace ck { -template + bool InvalidElementUseNumericalZeroValue> // TODO remove this bool, no longer needed struct StaticBuffer : public StaticallyIndexedArray { using type = T; using base = StaticallyIndexedArray; - T invalid_element_value_ = T{0}; - __host__ __device__ constexpr StaticBuffer() : base{} {} - __host__ __device__ constexpr StaticBuffer(T invalid_element_value) - : base{}, invalid_element_value_{invalid_element_value} - { - } - __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() { - return BufferAddressSpace; + return AddressSpace; } + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } + + __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } + + // read access template - __host__ __device__ constexpr auto Get(Number i, bool is_valid_element) const + __host__ __device__ constexpr const T& operator[](Number i) const { - if constexpr(InvalidElementUseNumericalZeroValue) - { - return is_valid_element ? At(i) : T{0}; - } - else - { - return is_valid_element ? 
At(i) : invalid_element_value_; - } + return base::operator[](i); } + // write access template - __host__ __device__ void Set(Number i, bool is_valid_element, const T& x) + __host__ __device__ constexpr T& operator()(Number i) { - if(is_valid_element) - { - At(i) = x; - } + return base::operator()(i); } - - __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } - - __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } }; -template -struct StaticBufferV2 : public StaticallyIndexedArray + index_t NumOfVector, + index_t ScalarPerVector, + bool InvalidElementUseNumericalZeroValue> // TODO remove this bool, no longer needed +struct StaticBufferTupleOfVector + : public StaticallyIndexedArray, NumOfVector> { using type = T; - using base = StaticallyIndexedArray; - - using VecBaseType = typename T::d1_t; - - __host__ __device__ static constexpr index_t GetVectorSize() - { - return sizeof(typename T::type) / sizeof(VecBaseType); - } + using base = StaticallyIndexedArray, NumOfVector>; - static constexpr index_t vector_size = GetVectorSize(); + static constexpr auto scalar_per_vector = Number{}; + static constexpr auto num_of_vector_ = Number{}; - VecBaseType invalid_element_value_ = VecBaseType{0}; - - T invalid_vec_value_ = T{0}; - - __host__ __device__ constexpr StaticBufferV2() : base{} {} - - __host__ __device__ constexpr StaticBufferV2(VecBaseType invalid_element_value) - : base{}, - invalid_vec_value_{invalid_element_value}, - invalid_element_value_{invalid_element_value} - { - } + __host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {} __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() { - return BufferAddressSpace; + return AddressSpace; } - template - __host__ __device__ constexpr auto& GetVector(Number vec_id) - { - return this->At(vec_id); - } + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } - template - __host__ __device__ constexpr const auto& GetVector(Number vec_id) const - { - return this->At(vec_id); - } + __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } + // read access template - __host__ __device__ constexpr auto& GetElement(Number i, bool) + __host__ __device__ constexpr const T& operator[](Number i) const { - constexpr auto vec_id = Number{}; - constexpr auto vec_off = Number{}; + constexpr auto vector_i = i / scalar_per_vector; + constexpr auto scalar_i = i % scalar_per_vector; - return this->At(vec_id).template AsType()(vec_off); - } - - template - __host__ __device__ constexpr auto GetElement(Number i, bool is_valid_element) const - { - constexpr auto vec_id = Number{}; - constexpr auto vec_off = Number{}; - - if constexpr(InvalidElementUseNumericalZeroValue) - { - return is_valid_element ? this->At(vec_id).template AsType()[vec_off] - : VecBaseType{0}; - } - else - { - return is_valid_element ? 
this->At(vec_id).template AsType()[vec_off] - : invalid_element_value_; - } + return base::operator[](vector_i).template AsType()[scalar_i]; } + // write access template - __host__ __device__ constexpr auto operator[](Number i) const + __host__ __device__ constexpr T& operator()(Number i) { - return GetElement(i, true); - } + constexpr auto vector_i = i / scalar_per_vector; + constexpr auto scalar_i = i % scalar_per_vector; - template - __host__ __device__ constexpr auto& operator()(Number i) - { - return GetElement(i, true); + return base::operator()(vector_i).template AsType()(scalar_i); } - - __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } - - __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } }; -template +template __host__ __device__ constexpr auto make_static_buffer(Number) { - return StaticBuffer{}; -} - -template -__host__ __device__ constexpr auto make_static_buffer(Number, T invalid_element_value) -{ - return StaticBuffer{invalid_element_value}; + return StaticBuffer{}; } } // namespace ck diff --git a/composable_kernel/include/utility/static_buffer_of_vector_type_v2.hpp b/composable_kernel/include/utility/static_buffer_of_vector_type_v2.hpp new file mode 100644 index 00000000000..ed3ae201fcc --- /dev/null +++ b/composable_kernel/include/utility/static_buffer_of_vector_type_v2.hpp @@ -0,0 +1,100 @@ +#ifndef CK_STATIC_BUFFER_OF_VECTOR_TYPE_V2_HPP +#define CK_STATIC_BUFFER_OF_VECTOR_TYPE_V2_HPP + +#include "statically_indexed_array.hpp" + +namespace ck { +template +struct StaticBufferOfVectorTypeV2 : public StaticallyIndexedArray +{ + using type = T; + using base = StaticallyIndexedArray; + + using VecBaseType = typename T::d1_t; + + __host__ __device__ static constexpr index_t GetVectorSize() + { + return sizeof(typename T::type) / sizeof(VecBaseType); + } + + static constexpr index_t vector_size = GetVectorSize(); + + VecBaseType invalid_element_value_ = VecBaseType{0}; + + T invalid_vec_value_ = T{0}; + + __host__ __device__ constexpr StaticBufferOfVectorTypeV2() : base{} {} + + __host__ __device__ constexpr StaticBufferOfVectorTypeV2(VecBaseType invalid_element_value) + : base{}, + invalid_vec_value_{invalid_element_value}, + invalid_element_value_{invalid_element_value} + { + } + + __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() + { + return BufferAddressSpace; + } + + template + __host__ __device__ constexpr auto& GetVector(Number vec_id) + { + return this->At(vec_id); + } + + template + __host__ __device__ constexpr const auto& GetVector(Number vec_id) const + { + return this->At(vec_id); + } + + template + __host__ __device__ constexpr auto& GetElement(Number i, bool) + { + constexpr auto vec_id = Number{}; + constexpr auto vec_off = Number{}; + + return this->At(vec_id).template AsType()(vec_off); + } + + template + __host__ __device__ constexpr auto GetElement(Number i, bool is_valid_element) const + { + constexpr auto vec_id = Number{}; + constexpr auto vec_off = Number{}; + + if constexpr(InvalidElementUseNumericalZeroValue) + { + return is_valid_element ? this->At(vec_id).template AsType()[vec_off] + : VecBaseType{0}; + } + else + { + return is_valid_element ? 
this->At(vec_id).template AsType()[vec_off] + : invalid_element_value_; + } + } + + template + __host__ __device__ constexpr auto operator[](Number i) const + { + return GetElement(i, true); + } + + template + __host__ __device__ constexpr auto& operator()(Number i) + { + return GetElement(i, true); + } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } + + __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp index 70f4d77d874..3dea944f6d1 100644 --- a/composable_kernel/include/utility/tuple.hpp +++ b/composable_kernel/include/utility/tuple.hpp @@ -117,6 +117,7 @@ struct Tuple : detail::TupleImpl __host__ __device__ constexpr const auto& At(Number) const { @@ -124,6 +125,7 @@ struct Tuple : detail::TupleImpl{}); } + // write access template __host__ __device__ constexpr auto& At(Number) { @@ -131,12 +133,14 @@ struct Tuple : detail::TupleImpl{}); } + // read access template __host__ __device__ constexpr const auto& operator[](Number i) const { return At(i); } + // write access template __host__ __device__ constexpr auto& operator()(Number i) { diff --git a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp index 01e5c57ab41..1b23aa1a8c9 100644 --- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp @@ -160,7 +160,7 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk( constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; -#elif 0 +#elif 1 // [M, N, K0, K1] = [128, 256, 4, 8], C = 128, for fp16 constexpr index_t BlockSize = 256; @@ -188,7 +188,7 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk( constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; -#elif 1 +#elif 0 // [M, N, K0, K1] = [128, 128, 4, 8], C = 64, for fp16 constexpr index_t BlockSize = 256; From 61fe6c9602bef4e21e64ea3c7969a44db9a8d4d5 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Oct 2021 21:08:16 -0500 Subject: [PATCH 04/16] adding StaticTensor --- .../threadwise_tensor_slice_transfer_v3r2.hpp | 18 ++++- .../include/utility/data_type.hpp | 12 ++++ .../include/utility/static_buffer.hpp | 67 ++++++++++++++----- 3 files changed, 79 insertions(+), 18 deletions(-) diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index 2e9c8c09f6a..efc03dbe8d2 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -592,14 +592,30 @@ struct ThreadwiseTensorSliceTransfer_v3r2 move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); } + __device__ constexpr auto GetSrcThreadBufferDescriptor() + { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, 
Number{}); + + constexpr auto src_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + } + private: +#if 0 // debug static constexpr auto thread_tensor_desc_ = make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); -#if 0 StaticTensor thread_tensor_; #else + static constexpr auto thread_tensor_desc_ = + make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + StaticTensorTupleOfVectorBuffer) template struct scalar_type; +// is_scalar_type +template +struct is_scalar_type +{ + static constexpr bool value = (scalar_type>::vector_size == 1); +}; + +// has_same_scalar_type +template +using has_same_scalar_type = is_same>::type, + typename scalar_type>::type>; + template struct scalar_type { diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp index 97cd80451df..9692e5a8287 100644 --- a/composable_kernel/include/utility/static_buffer.hpp +++ b/composable_kernel/include/utility/static_buffer.hpp @@ -43,18 +43,19 @@ struct StaticBuffer : public StaticallyIndexedArray // static buffer for vector template // TODO remove this bool, no longer needed + bool InvalidElementUseNumericalZeroValue, // TODO remove this bool, no longer needed, + typename enable_if::value, bool>::type = false> struct StaticBufferTupleOfVector - : public StaticallyIndexedArray, NumOfVector> + : public StaticallyIndexedArray, NumOfVector> { - using type = T; - using base = StaticallyIndexedArray, NumOfVector>; + using V = typename vector_type::type; + using base = StaticallyIndexedArray, NumOfVector>; - static constexpr auto scalar_per_vector = Number{}; - static constexpr auto num_of_vector_ = Number{}; + static constexpr auto s_per_v = Number{}; + static constexpr auto num_of_v_ = Number{}; __host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {} @@ -67,24 +68,56 @@ struct StaticBufferTupleOfVector __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } - // read access + // read access of S template - __host__ __device__ constexpr const T& operator[](Number i) const + __host__ __device__ constexpr const S& operator[](Number i) const { - constexpr auto vector_i = i / scalar_per_vector; - constexpr auto scalar_i = i % scalar_per_vector; + constexpr auto i_v = i / s_per_v; + constexpr auto i_s = i % s_per_v; - return base::operator[](vector_i).template AsType()[scalar_i]; + return base::operator[](i_v).template AsType()[i_s]; } - // write access + // write access of S template - __host__ __device__ constexpr T& operator()(Number i) + __host__ __device__ constexpr S& operator()(Number i) { - constexpr auto vector_i = i / scalar_per_vector; - constexpr auto scalar_i = i % scalar_per_vector; + constexpr auto i_v = i / s_per_v; + constexpr auto i_s = i % s_per_v; + + return base::operator()(i_v).template AsType()(i_s); + } + + // read access of X + template ::value, bool>::type = false> + __host__ __device__ constexpr auto GetAsType(Number i) const + { + constexpr index_t s_per_x = scalar_type>::vector_size; + + static_assert(s_per_v % s_per_x == 0, "wrong! 
V must be one or multiple X");
+
+        constexpr auto i_v = i / s_per_v;
+        constexpr auto i_x = (i % s_per_v) / s_per_x;
+
+        return base::operator[](i_v).template AsType()[i_x];
+    }
+
+    // write access of X
+    template ::value, bool>::type = false>
+    __host__ __device__ constexpr void SetAsType(Number i, X x)
+    {
+        constexpr index_t s_per_x = scalar_type>::vector_size;
+
+        static_assert(s_per_v % s_per_x == 0, "wrong! V must contain one or multiple X");
+
+        constexpr auto i_v = i / s_per_v;
+        constexpr auto i_x = (i % s_per_v) / s_per_x;
 
-        return base::operator()(vector_i).template AsType()(scalar_i);
+        base::operator()(i_v).template AsType()(i_x) = x;
     }
 };
 
From 7c898b2a42577af095e05b0217dc46c286e40330 Mon Sep 17 00:00:00 2001
From: Chao Liu 
Date: Sun, 17 Oct 2021 22:57:13 -0500
Subject: [PATCH 05/16] add missing constexpr

---
 .../include/tensor_description/multi_index_transform.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp
index 1a25e99f3bb..248148686bc 100644
--- a/composable_kernel/include/tensor_description/multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp
@@ -30,7 +30,8 @@ struct PassThrough
     __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
 
     template 
-    __host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
+    __host__ __device__ static constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                                  const UpIdx& idx_up)
     {
         static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
                       "wrong! inconsistent # of dimension");
@@ -1708,7 +1709,8 @@ struct Vectorize
     __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
 
     template 
-    __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
     {
         static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
                       "wrong! 
inconsistent # of dimension"); From ba8564641fbac87a8fa824c6cce97e341c1eee5a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Oct 2021 23:53:45 -0500 Subject: [PATCH 06/16] adding static tensor --- .../gridwise_gemm_xdlops_v2r3.hpp | 98 +++++++------- .../threadwise_tensor_slice_transfer_v3r2.hpp | 126 +++++++++++++----- 2 files changed, 144 insertions(+), 80 deletions(-) diff --git a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp index e3b0054bec2..c1df3961d35 100644 --- a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp @@ -397,56 +397,58 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 }(); // A matrix blockwise copy - auto a_blockwise_copy = - BlockwiseTensorSliceTransfer_v4, - ABlockTransferThreadSliceLengths_K0_M_K1, - ABlockTransferThreadClusterLengths_K0_M_K1, - ABlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(a_k0_m_k1_grid_desc), - decltype(a_k0_m_k1_block_desc), - ABlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - ABlockTransferSrcVectorDim, - 2, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_K1, - 1, - 1, - AThreadTransferSrcResetCoordinateAfterRun, - true>(a_k0_m_k1_grid_desc, - make_multi_index(0, m_block_data_idx_on_grid, 0), - a_k0_m_k1_block_desc, - make_multi_index(0, 0, 0)); + auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4< + BlockSize, + InMemoryDataOperationEnum_t::Set, + Sequence, + ABlockTransferThreadSliceLengths_K0_M_K1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_k0_m_k1_grid_desc), + decltype(a_k0_m_k1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, // TODO: maybe ABlockTransferDstAccessOrder should be the same as + // ABlockTransferSrcAccessOrder? + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>(a_k0_m_k1_grid_desc, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_k0_m_k1_block_desc, + make_multi_index(0, 0, 0)); // B matrix blockwise copy - auto b_blockwise_copy = - BlockwiseTensorSliceTransfer_v4, - BBlockTransferThreadSliceLengths_K0_N_K1, - BBlockTransferThreadClusterLengths_K0_N_K1, - BBlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(b_k0_n_k1_grid_desc), - decltype(b_k0_n_k1_block_desc), - BBlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - BBlockTransferSrcVectorDim, - 2, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_K1, - 1, - 1, - BThreadTransferSrcResetCoordinateAfterRun, - true>(b_k0_n_k1_grid_desc, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b_k0_n_k1_block_desc, - make_multi_index(0, 0, 0)); + auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v4< + BlockSize, + InMemoryDataOperationEnum_t::Set, + Sequence, + BBlockTransferThreadSliceLengths_K0_N_K1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_k0_n_k1_grid_desc), + decltype(b_k0_n_k1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, // TODO: maybe BBlockTransferDstAccessOrder should be the same as + // BBlockTransferSrcAccessOrder? 
+ BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>(b_k0_n_k1_grid_desc, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_k0_n_k1_block_desc, + make_multi_index(0, 0, 0)); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index efc03dbe8d2..3579c10418e 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -172,13 +172,10 @@ struct ThreadwiseTensorSliceTransfer_v3r2 src_tmp_vector.template AsType()(Number<0>{}) = src_buf.template Get(src_coord_.GetOffset(), is_src_valid); - // copy data from src_tmp_vector to thread_tensor_ + // copy data from src_tmp_vector to src_thread_scratch_ static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { -#if 0 // debug - thread_tensor_(src_data_idx + i * src_scalar_step_in_vector) = - src_tmp_vector.template AsType()[i]; -#else - thread_tensor_(src_data_idx_seq + i * src_scalar_step_in_vector) = +#if 1 // debug + src_thread_scratch_(src_data_idx_seq + i * src_scalar_step_in_vector) = src_tmp_vector.template AsType()[i]; #endif }); @@ -228,10 +225,24 @@ struct ThreadwiseTensorSliceTransfer_v3r2 } } + __device__ void TransferDataFromSrcThreadScratchToDstThreadScratch() + { + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = type_convert{}(src_thread_scratch_[idx]); + }); + } + template __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) { +#if 1 // debug + // if there is transpose, it's done here + // TODO move this elsewhere + TransferDataFromSrcThreadScratchToDstThreadScratch(); +#endif + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, "wrong!"); @@ -328,15 +339,10 @@ struct ThreadwiseTensorSliceTransfer_v3r2 vector_type_maker_t dst_tmp_vector; - // copy data from thread_tensor_ to dst_tmp_vector + // copy data from dst_thread_scratch_ to dst_tmp_vector static_for<0, DstScalarPerVector, 1>{}([&](auto i) { -#if 0 - dst_tmp_vector.template AsType()(i) = type_convert{}( - thread_tensor_[dst_data_idx + i * dst_scalar_step_in_vector]); -#else - dst_tmp_vector.template AsType()(i) = type_convert{}( - thread_tensor_[dst_data_idx_seq + i * dst_scalar_step_in_vector]); -#endif + dst_tmp_vector.template AsType()(i) = + dst_thread_scratch_[dst_data_idx_seq + i * dst_scalar_step_in_vector]; }); using dst_vector_t = typename decltype(dst_tmp_vector)::type; @@ -592,36 +598,92 @@ struct ThreadwiseTensorSliceTransfer_v3r2 move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); } - __device__ constexpr auto GetSrcThreadBufferDescriptor() +#if 1 // debug + __device__ static constexpr auto GetSrcThreadScratchDescriptor() { - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto src_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto src_scalar_step_in_vector = - 
generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(src_access_lengths), Number{}); + + constexpr auto desc0 = + make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length); + + // TODO this is hardcoded for GEMM TN layout, it also works for NHWC backward-weight + // TODO implemenet the general logic + constexpr auto desc1 = transform_tensor_descriptor( + desc0, + make_tuple(make_pass_through_transform(src_access_lengths_and_vector_length[I0]), + make_merge_transform_v3_division_mod( + make_tuple(src_access_lengths_and_vector_length[I1], + src_access_lengths_and_vector_length[I3])), + make_pass_through_transform(src_access_lengths_and_vector_length[I2])), + make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return desc1; } - private: -#if 0 // debug - static constexpr auto thread_tensor_desc_ = - make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + __device__ static constexpr auto GetDstThreadScratchDescriptor() + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; - StaticTensor - thread_tensor_; -#else - static constexpr auto thread_tensor_desc_ = - make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + constexpr auto dst_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(dst_access_lengths), Number{}); + + constexpr auto desc0 = + make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length); + + // TODO this is hardcoded for GEMM TN layout, it also works for NHWC backward-weight + // TODO implemenet the general logic + constexpr auto desc1 = transform_tensor_descriptor( + desc0, + make_tuple(make_pass_through_transform(dst_access_lengths_and_vector_length[I0]), + make_pass_through_transform(dst_access_lengths_and_vector_length[I1]), + make_merge_transform_v3_division_mod( + make_tuple(dst_access_lengths_and_vector_length[I2], + dst_access_lengths_and_vector_length[I3]))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return desc1; + } +#endif + + private: +#if 1 // debug + static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){}; + static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){}; StaticTensorTupleOfVectorBuffer + src_thread_scratch_; + + StaticTensorTupleOfVectorBuffer - thread_tensor_; + dst_thread_scratch_; #endif SrcCoord src_coord_; From ba74c9ff275c0d389466c01d532a3414d7e748d2 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Oct 2021 00:49:56 -0500 Subject: [PATCH 07/16] adding static tensor --- .../tensor_description/static_tensor.hpp | 71 ++++++++++++++++--- .../threadwise_tensor_slice_transfer_v3r2.hpp | 45 +++--------- .../include/utility/static_buffer.hpp | 4 +- 3 files changed, 75 insertions(+), 45 deletions(-) diff --git a/composable_kernel/include/tensor_description/static_tensor.hpp 
b/composable_kernel/include/tensor_description/static_tensor.hpp index 447dca7c176..9bf9b13daf9 100644 --- a/composable_kernel/include/tensor_description/static_tensor.hpp +++ b/composable_kernel/include/tensor_description/static_tensor.hpp @@ -81,7 +81,7 @@ struct StaticTensor // StaticTensor for vector template ::value && Idx::Size() == ndim_, bool>::type = false> - __host__ __device__ constexpr const T& operator[](Idx) const + __host__ __device__ constexpr const S& operator[](Idx) const { constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); @@ -122,7 +122,7 @@ struct StaticTensorTupleOfVectorBuffer { if constexpr(InvalidElementUseNumericalZeroValue) { - return T{0}; + return S{0}; } else { @@ -131,11 +131,11 @@ struct StaticTensorTupleOfVectorBuffer } } - // write access + // write access of S template ::value && Idx::Size() == ndim_, bool>::type = false> - __host__ __device__ constexpr T& operator()(Idx) + __host__ __device__ constexpr S& operator()(Idx) { constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); @@ -153,8 +153,61 @@ struct StaticTensorTupleOfVectorBuffer } } - StaticBufferTupleOfVector data_; - T invalid_element_value_ = T{0}; + // read access of X + template ::value && + is_known_at_compile_time::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr X GetAsType(Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_.template GetAsType(Number{}); + } + else + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + // TODO: is this right way to initialize a vector? + return X{0}; + } + else + { + // TODO: is this right way to initialize a vector? 
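+                // note (editorial, assuming clang ext_vector semantics): brace-
+                // initializing a vector from one scalar, as below, fills only lane 0
+                // and zero-fills the remaining lanes; a true broadcast of
+                // invalid_element_value_ would need an explicit per-lane fill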
+ return X{invalid_element_value_}; + } + } + } + + // write access of X + template ::value && + is_known_at_compile_time::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr void SetAsType(Idx, X x) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + data_.template SetAsType(Number{}, x); + } + } + + StaticBufferTupleOfVector data_; + S invalid_element_value_ = S{0}; }; template {}, Number{}); - constexpr auto src_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; constexpr auto src_dim_access_order = SrcDimAccessOrder{}; @@ -159,26 +156,17 @@ struct ThreadwiseTensorSliceTransfer_v3r2 #if 1 // debug constexpr auto src_data_idx_seq = generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); -#endif - - vector_type_maker_t src_tmp_vector; - - using src_vector_t = typename decltype(src_tmp_vector)::type; const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); - // copy data from src_buf to src_tmp_vector - src_tmp_vector.template AsType()(Number<0>{}) = - src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + using src_vector_t = typename vector_type_maker_t::type; - // copy data from src_tmp_vector to src_thread_scratch_ - static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { -#if 1 // debug - src_thread_scratch_(src_data_idx_seq + i * src_scalar_step_in_vector) = - src_tmp_vector.template AsType()[i]; + // copy data from src_buf to src_thread_scratch_ + src_thread_scratch_.template SetAsType( + src_data_idx_seq, + src_buf.template Get(src_coord_.GetOffset(), is_src_valid)); #endif - }); constexpr auto move_on_dim = [&]() constexpr { @@ -259,9 +247,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 constexpr auto dst_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto dst_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; constexpr auto dst_dim_access_order = DstDimAccessOrder{}; @@ -335,26 +320,18 @@ struct ThreadwiseTensorSliceTransfer_v3r2 #if 1 // debug constexpr auto dst_data_idx_seq = generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); -#endif - vector_type_maker_t dst_tmp_vector; - - // copy data from dst_thread_scratch_ to dst_tmp_vector - static_for<0, DstScalarPerVector, 1>{}([&](auto i) { - dst_tmp_vector.template AsType()(i) = - dst_thread_scratch_[dst_data_idx_seq + i * dst_scalar_step_in_vector]; - }); - - using dst_vector_t = typename decltype(dst_tmp_vector)::type; - - // copy data from dst_tmp_vector to dst_buf const bool is_dst_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + using dst_vector_t = typename vector_type_maker_t::type; + + // copy data from dst_thread_scratch_ to dst_buf dst_buf.template Set( dst_coord_.GetOffset(), is_dst_valid, - dst_tmp_vector.template AsType()[Number<0>{}]); + dst_thread_scratch_.template GetAsType(dst_data_idx_seq)); +#endif constexpr auto move_on_dim = [&]() constexpr { @@ -681,7 +658,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 StaticTensorTupleOfVectorBuffer dst_thread_scratch_; #endif diff --git 
a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp index 9692e5a8287..62cfb2f492c 100644 --- a/composable_kernel/include/utility/static_buffer.hpp +++ b/composable_kernel/include/utility/static_buffer.hpp @@ -94,7 +94,7 @@ struct StaticBufferTupleOfVector typename enable_if::value, bool>::type = false> __host__ __device__ constexpr auto GetAsType(Number i) const { - constexpr index_t s_per_x = scalar_type>::vector_size; + constexpr auto s_per_x = Number>::vector_size>{}; static_assert(s_per_v % s_per_x == 0, "wrong! V must one or multiple X"); @@ -110,7 +110,7 @@ struct StaticBufferTupleOfVector typename enable_if::value, bool>::type = false> __host__ __device__ constexpr void SetAsType(Number i, X x) { - constexpr index_t s_per_x = scalar_type>::vector_size; + constexpr auto s_per_x = Number>::vector_size>{}; static_assert(s_per_v % s_per_x == 0, "wrong! V must contain one or multiple X"); From 977185e685d952488e3ae252a54c4a3013397432 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Oct 2021 03:29:25 -0500 Subject: [PATCH 08/16] adding transpose --- .../threadwise_tensor_slice_transfer_v3r2.hpp | 107 +++++++++++++++++- .../include/utility/common_header.hpp | 1 + .../include/utility/transpose_vectors.hpp | 60 ++++++++++ .../include/device_gemm_xdlops_km_kn_mn.hpp | 6 +- 4 files changed, 169 insertions(+), 5 deletions(-) create mode 100644 composable_kernel/include/utility/transpose_vectors.hpp diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index 736942e23ec..da1c2d3eece 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -8,6 +8,38 @@ namespace ck { +namespace detail { +// TODO: How to fix this? It uses an struct instead of lambda because lambda +// doesn't have constructor +template +struct lambda_scalar_per_access_for_src_and_dst +{ + __host__ __device__ constexpr auto operator()(index_t i) const + { + if(i == SrcVectorDim && i == DstVectorDim) + { + return math::lcm(SrcScalarPerVector, DstScalarPerVector); + } + else if(i == SrcVectorDim) + { + return SrcScalarPerVector; + } + else if(i == DstVectorDim) + { + return DstScalarPerVector; + } + else + { + return 1; + } + } +}; + +} // namespace detail + // Assume: // 1. src_desc and dst_desc are not known at compile-time // 2. SrcBuffer and DstBuffer are DynamicBuffer @@ -185,7 +217,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 } (); - // move + // move src coord static_for<0, nDim, 1>{}([&](auto i) { if constexpr(move_on_dim[i]) { @@ -215,10 +247,81 @@ struct ThreadwiseTensorSliceTransfer_v3r2 __device__ void TransferDataFromSrcThreadScratchToDstThreadScratch() { +#if 0 // debug static_ford{}([&](auto idx) { // convert from SrcData to DstData here dst_thread_scratch_(idx) = type_convert{}(src_thread_scratch_[idx]); }); +#else + if constexpr(SrcVectorDim == DstVectorDim) + { + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = type_convert{}(src_thread_scratch_[idx]); + }); + } + else + { + // TODO type_convert is not used yet!!!!! 
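+            // sketch (editorial, hypothetical sizes): with SrcScalarPerVector = 2
+            // and DstScalarPerVector = 2, one step reads two src vectors {a0, a1},
+            // {b0, b1} and writes two dst vectors {a0, b0}, {a1, b1} - a 2x2
+            // element transpose at vector granularity; the loop below tiles the
+            // whole slice with such steps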
+ using src_vector_t = typename vector_type_maker_t::type; + using dst_vector_t = typename vector_type_maker_t::type; + + // each transpose does + // DstScalarPerVector # of src vectors in src_thread_scratch_ + // SrcScalarPerVector # of dst vectors in dst_thread_scratch_ + constexpr index_t num_src_vector = Number{}; + constexpr index_t num_dst_vector = Number{}; + + // Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose + // TODO: make this logic generic for all scenario + static_assert(SrcVectorDim != DstVectorDim, "wrong"); + + constexpr auto src_scalar_step_in_vector = generate_sequence( + detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = generate_sequence( + detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access_for_src_and_dst{}, + Number{}); + + constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + + static_ford{}([&](auto access_idx) { + constexpr auto data_idx = access_idx * scalar_per_access; + + constexpr auto data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + // get DstScalarPerVector # of src vectors from src_thread_scratch_ + const auto src_vectors = generate_tuple( + [&](auto i) { + // i increment corresponds to movement in DstVectorDim + return src_thread_scratch_.template GetAsType( + data_idx_seq + i * dst_scalar_step_in_vector); + }, + Number{}); + + StaticallyIndexedArray dst_vectors; + + // do data transpose + // TODO type_convert is not used yet!!!!! + transpose_vectors{}(src_vectors, + dst_vectors); + + // put SrcScalarPerVector # of dst vectors into dst_thread_scratch_ + static_for<0, num_dst_vector, 1>{}([&](auto i) { + // i increment corresponds to movement in DstVectorDim + dst_thread_scratch_.template SetAsType( + data_idx_seq + i * src_scalar_step_in_vector, dst_vectors[i]); + }); + }); + } +#endif } template @@ -350,7 +453,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 } (); - // move + // move dst coord static_for<0, nDim, 1>{}([&](auto i) { if constexpr(move_on_dim[i]) { diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 7c9a6b60c00..4afdc7d788f 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -34,6 +34,7 @@ #include "static_buffer_of_vector_type_v2.hpp" #include "dynamic_buffer.hpp" #include "is_known_at_compile_time.hpp" +#include "transpose_vectors.hpp" #include "inner_product.hpp" diff --git a/composable_kernel/include/utility/transpose_vectors.hpp b/composable_kernel/include/utility/transpose_vectors.hpp new file mode 100644 index 00000000000..ef4e10c49db --- /dev/null +++ b/composable_kernel/include/utility/transpose_vectors.hpp @@ -0,0 +1,60 @@ +#ifndef CK_TRANSPOSE_VECTORS_AMD_HPP +#define CK_TRANSPOSE_VECTORS_AMD_HPP + +#include "config.hpp" +#include "statically_indexed_array.hpp" +#include "data_type.hpp" + +namespace ck { + +template ::value, bool>::type = false> +struct transpose_vectors; + +// transpose fp16 2x2 +__device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t& y0, half2_t& y1) +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + +#if 1 // debug + const vector_type vx0{x0}, vx1{x1}; + vector_type vy0, vy1; + + vy0.template AsType()(I0) = vx0.template AsType()[I0]; + vy0.template AsType()(I1) = vx1.template AsType()[I0]; + 
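+    // note (editorial): the two lines above gather lane 0 of x0 and x1 into y0,
+    // and the two lines below gather lane 1 of each into y1, i.e.
+    // {a0, a1}, {b0, b1} -> {a0, b0}, {a1, b1}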
+ vy1.template AsType()(I0) = vx0.template AsType()[I1]; + vy1.template AsType()(I1) = vx1.template AsType()[I1]; + + y0 = vy0.template AsType()[I0]; + y1 = vy1.template AsType()[I0]; +#endif +} + +template +struct transpose_vectors +{ + using X = typename vector_type::type; + using Y = typename vector_type::type; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __device__ void operator()(const StaticallyIndexedArray& xs, + StaticallyIndexedArray& ys) + { + // TODO make this generic for any NX, NY + static_assert((NX == 2 && NY == 2), "wrong!"); + + if constexpr(NX == 2 && NY == 2) + { + transpose_fp16_2x2(xs[I0], xs[I1], ys(I0), ys(I1)); + } + } +}; + +} // namespace ck +#endif diff --git a/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp b/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp index c44aa7d9a27..396a55caf89 100644 --- a/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp +++ b/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp @@ -180,14 +180,14 @@ void device_gemm_xdlops_km_kn_mn(const Tensor& a_k_m, using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>; using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>; - constexpr index_t ABlockTransferSrcScalarPerVector_M = 4; - constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8; + constexpr index_t ABlockTransferSrcScalarPerVector_M = 2; + constexpr index_t ABlockTransferDstScalarPerVector_K1 = 2; using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>; using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>; constexpr index_t BBlockTransferSrcScalarPerVector_N = 2; - constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8; + constexpr index_t BBlockTransferDstScalarPerVector_K1 = 2; constexpr index_t CThreadTransferDstScalarPerVector = 1; #elif 0 From bc917d10f6cdde98d46b2500b0f945588fc07dca Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Oct 2021 04:33:06 -0500 Subject: [PATCH 09/16] add inline asm for transpose 2x2 of half_t --- .../include/utility/transpose_vectors.hpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/composable_kernel/include/utility/transpose_vectors.hpp b/composable_kernel/include/utility/transpose_vectors.hpp index ef4e10c49db..03327e280a9 100644 --- a/composable_kernel/include/utility/transpose_vectors.hpp +++ b/composable_kernel/include/utility/transpose_vectors.hpp @@ -16,10 +16,10 @@ struct transpose_vectors; // transpose fp16 2x2 __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t& y0, half2_t& y1) { +#if 0 // debug static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; -#if 1 // debug const vector_type vx0{x0}, vx1{x1}; vector_type vy0, vy1; @@ -31,14 +31,24 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t y0 = vy0.template AsType()[I0]; y1 = vy1.template AsType()[I0]; +#else + asm volatile("\n \ + v_pack_b32_f16 %0, %2, %3 \n \ + v_pack_b32_f16 %1, %2, %3, op_sel:[1, 1] \n \ + " + : "=v"(y0), "=v"(y1) + : "v"(x0), "v"(x1), "0"(y0), "1"(y1)); #endif } template struct transpose_vectors { - using X = typename vector_type::type; - using Y = typename vector_type::type; + static constexpr index_t s_per_x = NY; + static constexpr index_t s_per_y = NX; + + using X = typename vector_type::type; + using Y = typename vector_type::type; static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; From 
6561289eab13bc550cdfcc8460d6e39734923241 Mon Sep 17 00:00:00 2001
From: Chao Liu 
Date: Mon, 18 Oct 2021 21:48:53 -0500
Subject: [PATCH 10/16] add general transpose_vectors(), but has unnecessary
 register initialization using v_mov

---
 .../tensor_description/static_tensor.hpp      |  4 ++
 .../include/utility/static_buffer.hpp         |  6 +++
 .../include/utility/transpose_vectors.hpp     | 47 ++++++++++++++++---
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/composable_kernel/include/tensor_description/static_tensor.hpp b/composable_kernel/include/tensor_description/static_tensor.hpp
index 9bf9b13daf9..86f3e3d1676 100644
--- a/composable_kernel/include/tensor_description/static_tensor.hpp
+++ b/composable_kernel/include/tensor_description/static_tensor.hpp
@@ -95,6 +95,8 @@ struct StaticTensorTupleOfVectorBuffer
     static constexpr index_t num_of_vector_ =
         math::integer_divide_ceil(element_space_size_, ScalarPerVector);
 
+    using V = vector_type;
+
     __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer() : invalid_element_value_{0} {}
 
     __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer(S invalid_element_value)
@@ -103,6 +105,7 @@ struct StaticTensorTupleOfVectorBuffer
     }
 
     // read access of S
+    // Idx is for S, not V. Idx should be aligned with V
     template ::value && Idx::Size() == ndim_,
                                      bool>::type = false>
@@ -132,6 +135,7 @@ struct StaticTensorTupleOfVectorBuffer
     }
 
     // write access of S
+    // Idx is for S, not V. Idx should be aligned with V
     template ::value && Idx::Size() == ndim_,
                                      bool>::type = false>
diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp
index 62cfb2f492c..02deb06d6d6 100644
--- a/composable_kernel/include/utility/static_buffer.hpp
+++ b/composable_kernel/include/utility/static_buffer.hpp
@@ -69,6 +69,7 @@ struct StaticBufferTupleOfVector
     __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; }
 
     // read access of S
+    // i is offset of S
     template 
     __host__ __device__ constexpr const S& operator[](Number i) const
     {
@@ -79,6 +80,7 @@ struct StaticBufferTupleOfVector
     }
 
     // write access of S
+    // i is offset of S
     template 
     __host__ __device__ constexpr S& operator()(Number i)
     {
@@ -89,6 +91,7 @@ struct StaticBufferTupleOfVector
     }
 
     // read access of X
+    // i is offset of S, not X. i should be aligned to X
     template ::value, bool>::type = false>
@@ -97,6 +100,7 @@ struct StaticBufferTupleOfVector
         constexpr auto s_per_x = Number>::vector_size>{};
 
         static_assert(s_per_v % s_per_x == 0, "wrong! V must be one or multiple X");
+        static_assert(i % s_per_x == 0, "wrong!");
 
         constexpr auto i_v = i / s_per_v;
         constexpr auto i_x = (i % s_per_v) / s_per_x;
@@ -105,6 +109,7 @@ struct StaticBufferTupleOfVector
     }
 
     // write access of X
+    // i is offset of S, not X. i should be aligned to X
     template ::value, bool>::type = false>
@@ -113,6 +118,7 @@ struct StaticBufferTupleOfVector
         constexpr auto s_per_x = Number>::vector_size>{};
 
         static_assert(s_per_v % s_per_x == 0, "wrong! 
+        static_assert(i % s_per_x == 0, "wrong!");
 
         constexpr auto i_v = i / s_per_v;
         constexpr auto i_x = (i % s_per_v) / s_per_x;
 
diff --git a/composable_kernel/include/utility/transpose_vectors.hpp b/composable_kernel/include/utility/transpose_vectors.hpp
index 03327e280a9..0b865f0ba66 100644
--- a/composable_kernel/include/utility/transpose_vectors.hpp
+++ b/composable_kernel/include/utility/transpose_vectors.hpp
@@ -44,25 +44,58 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
 template <index_t NX, index_t NY>
 struct transpose_vectors<half_t, NX, NY>
 {
+    // we got [NY * NX] amount of S data to be transposed
     static constexpr index_t s_per_x = NY;
     static constexpr index_t s_per_y = NX;
 
+    using S = half_t;
     using X = typename vector_type<half_t, s_per_x>::type;
     using Y = typename vector_type<half_t, s_per_y>::type;
 
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-
-    __device__ void operator()(const StaticallyIndexedArray<X, NX>& xs,
-                               StaticallyIndexedArray<Y, NY>& ys)
+    __device__ void operator()(const StaticallyIndexedArray<X, NX>& x_tuple,
+                               StaticallyIndexedArray<Y, NY>& y_tuple)
     {
-        // TODO make this generic for any NX, NY
+        static constexpr auto I0 = Number<0>{};
+        static constexpr auto I1 = Number<1>{};
+        static constexpr auto I2 = Number<2>{};
+
+#if 0
         static_assert((NX == 2 && NY == 2), "wrong!");
 
         if constexpr(NX == 2 && NY == 2)
         {
-            transpose_fp16_2x2(xs[I0], xs[I1], ys(I0), ys(I1));
+            transpose_fp16_2x2(x_tuple[I0], x_tuple[I1], y_tuple(I0), y_tuple(I1));
         }
+#else
+        static_assert((NX % 2 == 0 && NY % 2 == 0), "wrong!");
+
+        // create tuple of vector_type for holding data from x_tuple
+        const auto vx_tuple = generate_tuple(
+            [&](auto i) { return vector_type<half_t, s_per_x>{x_tuple[i]}; }, Number<NX>{});
+
+        // create tuple of vector_type to hold intermediate data for y_tuple
+        auto vy_tuple =
+            generate_tuple([&](auto) { return vector_type<half_t, s_per_y>{}; }, Number<NY>{});
+
+        // loop over 2x2 tile and transpose data from vx_tuple into vy_tuple
+        static_for<0, NY, 2>{}([&](auto iy) {
+            static_for<0, NX, 2>{}([&](auto ix) {
+                // reference to 2 half2_t data from vx_tuple
+                const auto& x_s2_0 = vx_tuple[ix].template AsType<half2_t>()[iy / I2];
+                const auto& x_s2_1 = vx_tuple[ix + I1].template AsType<half2_t>()[iy / I2];
+
+                // reference to 2 half2_t data from vy_tuple
+                auto& y_s2_0 = vy_tuple(iy).template AsType<half2_t>()(ix / I2);
+                auto& y_s2_1 = vy_tuple(iy + I1).template AsType<half2_t>()(ix / I2);
+
+                // transpose
+                transpose_fp16_2x2(x_s2_0, x_s2_1, y_s2_0, y_s2_1);
+            });
+        });
+
+        // copy data from vy_tuple into y_tuple
+        static_for<0, NY, 1>{}([&](auto i) { y_tuple(i) = vy_tuple[i].template AsType<Y>()[I0]; });
+#endif
     }
 };

From 9ad748713638df1aa9a5e6566794eb04472b1e6f Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Tue, 19 Oct 2021 00:26:47 -0500
Subject: [PATCH 11/16] fix unnecessary register initialization in transpose_vector by using more pass-by-reference

---
 .../tensor_description/static_tensor.hpp      | 38 ++++++++++++++++---
 .../blockwise_tensor_slice_transfer.hpp       |  2 +-
 .../threadwise_tensor_slice_transfer_v3r2.hpp | 36 +++++++++---------
 .../include/utility/static_buffer.hpp         | 32 ++++++++++++++--
 .../utility/statically_indexed_array.hpp      | 34 +++++++++++++----
 .../include/utility/transpose_vectors.hpp     | 31 +++------------
 composable_kernel/include/utility/tuple.hpp   |  7 ++++
 .../include/utility/tuple_helper.hpp          |  7 ++++
 .../include/device_gemm_xdlops_km_kn_mn.hpp   |  6 +--
 9 files changed, 128 insertions(+), 65 deletions(-)

diff --git a/composable_kernel/include/tensor_description/static_tensor.hpp
b/composable_kernel/include/tensor_description/static_tensor.hpp index 86f3e3d1676..e71980b8183 100644 --- a/composable_kernel/include/tensor_description/static_tensor.hpp +++ b/composable_kernel/include/tensor_description/static_tensor.hpp @@ -104,8 +104,8 @@ struct StaticTensorTupleOfVectorBuffer { } - // read access of S - // Idx is for S, not V. Idx should be aligned with V + // Get S + // Idx is for S, not V template ::value && Idx::Size() == ndim_, bool>::type = false> @@ -134,8 +134,8 @@ struct StaticTensorTupleOfVectorBuffer } } - // write access of S - // Idx is for S, not V. Idx should be aligned with V + // Set S + // Idx is for S, not V template ::value && Idx::Size() == ndim_, bool>::type = false> @@ -157,7 +157,8 @@ struct StaticTensorTupleOfVectorBuffer } } - // read access of X + // Get X + // Idx is for S, not X. Idx should be aligned with X template ::value && @@ -190,7 +191,8 @@ struct StaticTensorTupleOfVectorBuffer } } - // write access of X + // Set X + // Idx is for S, not X. Idx should be aligned with X template ::value && @@ -210,6 +212,30 @@ struct StaticTensorTupleOfVectorBuffer } } + // Get read access to V. No is_valid check + // Idx is for S, not V. Idx should be aligned with V + template + __host__ __device__ constexpr const V& GetVectorTypeReference(Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + return data_.GetVectorTypeReference(Number{}); + } + + // Get read access to V. No is_valid check + // Idx is for S, not V. Idx should be aligned with V + template + __host__ __device__ constexpr V& GetVectorTypeReference(Idx) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + return data_.GetVectorTypeReference(Number{}); + } + StaticBufferTupleOfVector data_; S invalid_element_value_ = S{0}; }; diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp index e0330853a74..f815a47f693 100644 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp @@ -5,7 +5,7 @@ #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" #include "cluster_descriptor.hpp" -//#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" #include "threadwise_tensor_slice_transfer_v3r2.hpp" namespace ck { diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index da1c2d3eece..d3a77482829 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -262,9 +262,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 } else { - // TODO type_convert is not used yet!!!!! 
- using src_vector_t = typename vector_type_maker_t::type; - using dst_vector_t = typename vector_type_maker_t::type; // each transpose does // DstScalarPerVector # of src vectors in src_thread_scratch_ @@ -297,28 +294,33 @@ struct ThreadwiseTensorSliceTransfer_v3r2 constexpr auto data_idx_seq = generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); - // get DstScalarPerVector # of src vectors from src_thread_scratch_ - const auto src_vectors = generate_tuple( - [&](auto i) { + // TODO type_convert is not used yet!!!!! + using src_vector_t = vector_type_maker_t; + using dst_vector_t = vector_type_maker_t; + + // get DstScalarPerVector # of read-only references to src vectors from + // src_thread_scratch_ + const auto src_vector_refs = generate_tie( + [&](auto i) -> const src_vector_t& { // i increment corresponds to movement in DstVectorDim - return src_thread_scratch_.template GetAsType( + return src_thread_scratch_.GetVectorTypeReference( data_idx_seq + i * dst_scalar_step_in_vector); }, Number{}); - StaticallyIndexedArray dst_vectors; + // get SrcScalarPerVector # of references to dst vectors from dst_thread_scratch_ + auto dst_vector_refs = generate_tie( + [&](auto i) -> dst_vector_t& { + // i increment corresponds to movement in SrcVectorDim + return dst_thread_scratch_.GetVectorTypeReference( + data_idx_seq + i * src_scalar_step_in_vector); + }, + Number{}); // do data transpose // TODO type_convert is not used yet!!!!! - transpose_vectors{}(src_vectors, - dst_vectors); - - // put SrcScalarPerVector # of dst vectors into dst_thread_scratch_ - static_for<0, num_dst_vector, 1>{}([&](auto i) { - // i increment corresponds to movement in DstVectorDim - dst_thread_scratch_.template SetAsType( - data_idx_seq + i * src_scalar_step_in_vector, dst_vectors[i]); - }); + transpose_vectors{}( + src_vector_refs, dst_vector_refs); }); } #endif diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp index 02deb06d6d6..1deb0780252 100644 --- a/composable_kernel/include/utility/static_buffer.hpp +++ b/composable_kernel/include/utility/static_buffer.hpp @@ -68,7 +68,7 @@ struct StaticBufferTupleOfVector __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } - // read access of S + // Get S // i is offset of S template __host__ __device__ constexpr const S& operator[](Number i) const @@ -79,7 +79,7 @@ struct StaticBufferTupleOfVector return base::operator[](i_v).template AsType()[i_s]; } - // write access of S + // Set S // i is offset of S template __host__ __device__ constexpr S& operator()(Number i) @@ -90,7 +90,7 @@ struct StaticBufferTupleOfVector return base::operator()(i_v).template AsType()(i_s); } - // read access of X + // Get X // i is offset of S, not X. i should be aligned to X template ()[i_x]; } - // write access of X + // Set X // i is offset of S, not X. i should be aligned to X template ()(i_x) = x; } + + // Get read access to vector_type V + // i is offset of S, not V. i should be aligned to V + template + __host__ __device__ constexpr const auto& GetVectorTypeReference(Number i) const + { + static_assert(i % s_per_v == 0, "wrong!"); + + constexpr auto i_v = i / s_per_v; + + return base::operator[](i_v); + } + + // Get write access to vector_type V + // i is offset of S, not V. 
i should be aligned to V
+    template <index_t I>
+    __host__ __device__ constexpr auto& GetVectorTypeReference(Number<I> i)
+    {
+        static_assert(i % s_per_v == 0, "wrong!");
+
+        constexpr auto i_v = i / s_per_v;
+
+        return base::operator()(i_v);
+    }
 };
 
 template
diff --git a/composable_kernel/include/utility/statically_indexed_array.hpp b/composable_kernel/include/utility/statically_indexed_array.hpp
index f30a3a9ee63..372751faf16 100644
--- a/composable_kernel/include/utility/statically_indexed_array.hpp
+++ b/composable_kernel/include/utility/statically_indexed_array.hpp
@@ -8,20 +8,38 @@ namespace ck {
 
 namespace detail {
+template <typename X, typename Y>
+struct tuple_concat;
 
-template <typename T, index_t N>
-__host__ __device__ constexpr auto generate_same_type_tuple()
+template <typename... Xs, typename... Ys>
+struct tuple_concat<Tuple<Xs...>, Tuple<Ys...>>
 {
-    return generate_tuple([](auto) -> T { return T{}; }, Number<N>{});
-}
+    using type = Tuple<Xs..., Ys...>;
+};
 
-template <typename T, index_t N>
-using same_type_tuple = decltype(generate_same_type_tuple<T, N>());
+template <typename T, index_t N>
+struct StaticallyIndexedArrayImpl
+{
+    using type =
+        typename tuple_concat<typename StaticallyIndexedArrayImpl<T, N / 2>::type,
+                              typename StaticallyIndexedArrayImpl<T, N - N / 2>::type>::type;
+};
 
+template <typename T>
+struct StaticallyIndexedArrayImpl<T, 0>
+{
+    using type = Tuple<>;
+};
+
+template <typename T>
+struct StaticallyIndexedArrayImpl<T, 1>
+{
+    using type = Tuple<T>;
+};
 } // namespace detail
 
-template <typename T, index_t N>
-using StaticallyIndexedArray = detail::same_type_tuple<T, N>;
+template <typename T, index_t N>
+using StaticallyIndexedArray = typename detail::StaticallyIndexedArrayImpl<T, N>::type;
 
 template <typename X, typename... Xs>
 __host__ __device__ constexpr auto make_statically_indexed_array(const X& x, const Xs&... xs)
diff --git a/composable_kernel/include/utility/transpose_vectors.hpp b/composable_kernel/include/utility/transpose_vectors.hpp
index 0b865f0ba66..a000c14e06c 100644
--- a/composable_kernel/include/utility/transpose_vectors.hpp
+++ b/composable_kernel/include/utility/transpose_vectors.hpp
@@ -48,35 +48,18 @@ struct transpose_vectors<half_t, NX, NY>
     static constexpr index_t s_per_x = NY;
     static constexpr index_t s_per_y = NX;
 
-    using S = half_t;
-    using X = typename vector_type<half_t, s_per_x>::type;
-    using Y = typename vector_type<half_t, s_per_y>::type;
+    using S  = half_t;
+    using VX = vector_type<S, s_per_x>;
+    using VY = vector_type<S, s_per_y>;
 
-    __device__ void operator()(const StaticallyIndexedArray<X, NX>& x_tuple,
-                               StaticallyIndexedArray<Y, NY>& y_tuple)
+    __device__ void operator()(const StaticallyIndexedArray<const VX&, NX>& vx_tuple,
+                               StaticallyIndexedArray<VY&, NY>& vy_tuple)
     {
-        static constexpr auto I0 = Number<0>{};
         static constexpr auto I1 = Number<1>{};
         static constexpr auto I2 = Number<2>{};
 
-#if 0
-        static_assert((NX == 2 && NY == 2), "wrong!");
-
-        if constexpr(NX == 2 && NY == 2)
-        {
-            transpose_fp16_2x2(x_tuple[I0], x_tuple[I1], y_tuple(I0), y_tuple(I1));
-        }
-#else
         static_assert((NX % 2 == 0 && NY % 2 == 0), "wrong!");
 
-        // create tuple of vector_type for holding data from x_tuple
-        const auto vx_tuple = generate_tuple(
-            [&](auto i) { return vector_type<half_t, s_per_x>{x_tuple[i]}; }, Number<NX>{});
-
-        // create tuple of vector_type to hold intermediate data for y_tuple
-        auto vy_tuple =
-            generate_tuple([&](auto) { return vector_type<half_t, s_per_y>{}; }, Number<NY>{});
-
         // loop over 2x2 tile and transpose data from vx_tuple into vy_tuple
         static_for<0, NY, 2>{}([&](auto iy) {
             static_for<0, NX, 2>{}([&](auto ix) {
                 // reference to 2 half2_t data from vx_tuple
                 const auto& x_s2_0 = vx_tuple[ix].template AsType<half2_t>()[iy / I2];
                 const auto& x_s2_1 = vx_tuple[ix + I1].template AsType<half2_t>()[iy / I2];
 
                 // reference to 2 half2_t data from vy_tuple
                 auto& y_s2_0 = vy_tuple(iy).template AsType<half2_t>()(ix / I2);
                 auto& y_s2_1 = vy_tuple(iy + I1).template AsType<half2_t>()(ix / I2);
 
                 // transpose
                 transpose_fp16_2x2(x_s2_0, x_s2_1, y_s2_0, y_s2_1);
             });
         });
-
-        // copy data from vy_tuple into y_tuple
-        static_for<0, NY, 1>{}([&](auto i) { y_tuple(i) = vy_tuple[i].template AsType<Y>()[I0]; });
-#endif
     }
 };
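[Editor's note on the StaticallyIndexedArrayImpl rewrite above] Building Tuple<T, T, ..., T> by recursive halving keeps template instantiation depth at O(log N) instead of O(N), and, unlike the old generate_same_type_tuple (which returns T{} by value), it can also spell reference element types such as Tuple<T&, T&>, which the pass-by-reference plumbing in this patch relies on. A standalone sketch of the same construction, using std::tuple in place of ck::Tuple (illustrative only):

    #include <cstddef>
    #include <tuple>
    #include <type_traits>

    template <typename X, typename Y>
    struct tuple_concat;

    template <typename... Xs, typename... Ys>
    struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
    {
        using type = std::tuple<Xs..., Ys...>;
    };

    // Recursive halving: N copies of T = (N/2 copies) ++ (N - N/2 copies).
    template <typename T, std::size_t N>
    struct array_impl
    {
        using type = typename tuple_concat<typename array_impl<T, N / 2>::type,
                                           typename array_impl<T, N - N / 2>::type>::type;
    };

    template <typename T>
    struct array_impl<T, 0> { using type = std::tuple<>; };

    template <typename T>
    struct array_impl<T, 1> { using type = std::tuple<T>; };

    static_assert(std::is_same<array_impl<int, 3>::type,
                               std::tuple<int, int, int>>::value, "");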
diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp
index 3dea944f6d1..96cab4b99ee 100644
--- a/composable_kernel/include/utility/tuple.hpp
+++ b/composable_kernel/include/utility/tuple.hpp
@@ -166,5 +166,12 @@ __host__ __device__ constexpr auto make_tuple(Xs&&... xs)
     return Tuple<remove_cvref_t<Xs>...>(std::forward<Xs>(xs)...);
 }
 
+// https://en.cppreference.com/w/cpp/utility/tuple/tie
+template <typename... Args>
+constexpr Tuple<Args&...> tie(Args&... args) noexcept
+{
+    return {args...};
+}
+
 } // namespace ck
 #endif
diff --git a/composable_kernel/include/utility/tuple_helper.hpp b/composable_kernel/include/utility/tuple_helper.hpp
index f568a9d824e..4e5b9cf97c8 100644
--- a/composable_kernel/include/utility/tuple_helper.hpp
+++ b/composable_kernel/include/utility/tuple_helper.hpp
@@ -13,6 +13,13 @@ __host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
                   typename arithmetic_sequence_gen<0, N, 1>::type{});
 }
 
+template <typename F, index_t N>
+__host__ __device__ constexpr auto generate_tie(F&& f, Number<N>)
+{
+    return unpack([&f](auto&&... xs) { return tie(f(xs)...); },
+                  typename arithmetic_sequence_gen<0, N, 1>::type{});
+}
+
 namespace detail {
 
 template
diff --git a/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp b/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp
index 396a55caf89..c44aa7d9a27 100644
--- a/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp
+++ b/host/driver_offline/include/device_gemm_xdlops_km_kn_mn.hpp
@@ -180,14 +180,14 @@ void device_gemm_xdlops_km_kn_mn(const Tensor& a_k_m,
 
     using ABlockTransferThreadSliceLengths_K0_M_K1 = Sequence<1, 4, 8>;
     using ABlockTransferThreadClusterLengths_K0_M_K1 = Sequence<4, 64, 1>;
 
-    constexpr index_t ABlockTransferSrcScalarPerVector_M = 2;
-    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 2;
+    constexpr index_t ABlockTransferSrcScalarPerVector_M = 4;
+    constexpr index_t ABlockTransferDstScalarPerVector_K1 = 8;
 
     using BBlockTransferThreadSliceLengths_K0_N_K1 = Sequence<1, 2, 8>;
     using BBlockTransferThreadClusterLengths_K0_N_K1 = Sequence<4, 64, 1>;
 
     constexpr index_t BBlockTransferSrcScalarPerVector_N = 2;
-    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 2;
+    constexpr index_t BBlockTransferDstScalarPerVector_K1 = 8;
 
     constexpr index_t CThreadTransferDstScalarPerVector = 1;
 #elif 0
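[Editor's note on tie/generate_tie in PATCH 11] generate_tie builds a Tuple of lvalue references into storage that already exists, so transpose_vectors can write its results straight into dst_thread_scratch_ instead of into default-constructed temporaries, which is where the unnecessary v_mov register initializations came from. A rough standalone illustration of the reference-tuple idea with std::tie (illustrative, not ck code):

    #include <cstdio>
    #include <tuple>

    int main()
    {
        int y0 = 0, y1 = 0;

        // A tuple of references: writing through it mutates y0/y1 in
        // place; nothing is copied and no temporaries are created.
        std::tuple<int&, int&> refs = std::tie(y0, y1);

        std::get<0>(refs) = 11;
        std::get<1>(refs) = 22;

        printf("%d %d\n", y0, y1); // prints: 11 22
        return 0;
    }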

From 405e74a052d1b13de6169834b7daa8beb6b62d3d Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Tue, 19 Oct 2021 20:05:32 -0500
Subject: [PATCH 12/16] add hardcoded logic for NHWC wrw

---
 .../threadwise_tensor_slice_transfer_v3r2.hpp | 37 +++++++++++++++++--
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
index d3a77482829..c776387970e 100644
--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
@@ -687,6 +687,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
+        constexpr auto I4 = Number<4>{};
 
         constexpr auto src_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
@@ -699,7 +700,8 @@ struct ThreadwiseTensorSliceTransfer_v3r2
         constexpr auto desc0 =
             make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length);
 
-        // TODO this is hardcoded for GEMM TN layout, it also works for NHWC backward-weight
+#if 0
+        // TODO this is hardcoded for GEMM TN layout
         // TODO implement the general logic
         constexpr auto desc1 = transform_tensor_descriptor(
             desc0,
             make_tuple(make_pass_through_transform(src_access_lengths_and_vector_length[I0]),
                        make_merge_transform_v3_division_mod(
                            make_tuple(src_access_lengths_and_vector_length[I1],
                                       src_access_lengths_and_vector_length[I3])),
                        make_pass_through_transform(src_access_lengths_and_vector_length[I2])),
             make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#else
+        // TODO this is hardcoded for NHWC backward-weight kernel
+        // TODO implement the general logic
+        constexpr auto desc1 = transform_tensor_descriptor(
+            desc0,
+            make_tuple(make_pass_through_transform(src_access_lengths_and_vector_length[I0]),
+                       make_pass_through_transform(src_access_lengths_and_vector_length[I1]),
+                       make_merge_transform_v3_division_mod(
+                           make_tuple(src_access_lengths_and_vector_length[I2],
+                                      src_access_lengths_and_vector_length[I4])),
+                       make_pass_through_transform(src_access_lengths_and_vector_length[I3])),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 4>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+#endif
 
         return desc1;
     }
@@ -720,6 +736,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
+        constexpr auto I4 = Number<4>{};
 
         constexpr auto dst_scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
@@ -731,8 +748,8 @@ struct ThreadwiseTensorSliceTransfer_v3r2
         constexpr auto desc0 =
             make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length);
-
-        // TODO this is hardcoded for GEMM TN layout, it also works for NHWC backward-weight
+#if 0
+        // TODO this is hardcoded for GEMM TN layout
         // TODO implement the general logic
         constexpr auto desc1 = transform_tensor_descriptor(
             desc0,
             make_tuple(make_pass_through_transform(dst_access_lengths_and_vector_length[I0]),
                        make_pass_through_transform(dst_access_lengths_and_vector_length[I1]),
                        make_merge_transform_v3_division_mod(
                            make_tuple(dst_access_lengths_and_vector_length[I2],
                                       dst_access_lengths_and_vector_length[I3]))),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#else
+        // TODO this is hardcoded for NHWC backward-weight kernel
+        // TODO implement the general logic
+        constexpr auto desc1 = transform_tensor_descriptor(
+            desc0,
+            make_tuple(make_pass_through_transform(dst_access_lengths_and_vector_length[I0]),
+                       make_pass_through_transform(dst_access_lengths_and_vector_length[I1]),
+                       make_pass_through_transform(dst_access_lengths_and_vector_length[I2]),
+                       make_merge_transform_v3_division_mod(
+                           make_tuple(dst_access_lengths_and_vector_length[I3],
+                                      dst_access_lengths_and_vector_length[I4]))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+#endif
 
         return desc1;
     }

From 59314e4c60b03a1190c8ee69243949ce97dd35a2 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Tue, 19 Oct 2021 21:48:28 -0500
Subject: [PATCH 13/16] improve asm for v_pack

---
 .../threadwise_tensor_slice_transfer_v3r2.hpp | 15 +++------------
 .../include/utility/transpose_vectors.hpp     | 15 ++++++++++-----
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
index c776387970e..60d24857e70 100644
--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp
@@ -185,7 +185,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2
             src_scalar_per_access;
         }();
 
-#if 1 // debug
         constexpr auto src_data_idx_seq =
generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); @@ -198,7 +197,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 src_thread_scratch_.template SetAsType( src_data_idx_seq, src_buf.template Get(src_coord_.GetOffset(), is_src_valid)); -#endif constexpr auto move_on_dim = [&]() constexpr { @@ -330,11 +328,9 @@ struct ThreadwiseTensorSliceTransfer_v3r2 __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) { -#if 1 // debug - // if there is transpose, it's done here - // TODO move this elsewhere + // if there is transpose, it's done here + // TODO move this elsewhere TransferDataFromSrcThreadScratchToDstThreadScratch(); -#endif static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, @@ -422,7 +418,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 dst_scalar_per_access; }(); -#if 1 // debug constexpr auto dst_data_idx_seq = generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); @@ -436,7 +431,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 dst_coord_.GetOffset(), is_dst_valid, dst_thread_scratch_.template GetAsType(dst_data_idx_seq)); -#endif constexpr auto move_on_dim = [&]() constexpr { @@ -680,7 +674,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); } -#if 1 // debug __device__ static constexpr auto GetSrcThreadScratchDescriptor() { constexpr auto I0 = Number<0>{}; @@ -748,6 +741,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 constexpr auto desc0 = make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length); + #if 0 // TODO this is hardcoded for GEMM TN layout // TODO implemenet the general logic @@ -777,10 +771,8 @@ struct ThreadwiseTensorSliceTransfer_v3r2 return desc1; } -#endif private: -#if 1 // debug static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){}; static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){}; @@ -797,7 +789,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 decltype(dst_thread_scratch_desc_), true> dst_thread_scratch_; -#endif SrcCoord src_coord_; DstCoord dst_coord_; diff --git a/composable_kernel/include/utility/transpose_vectors.hpp b/composable_kernel/include/utility/transpose_vectors.hpp index a000c14e06c..866241a9479 100644 --- a/composable_kernel/include/utility/transpose_vectors.hpp +++ b/composable_kernel/include/utility/transpose_vectors.hpp @@ -16,7 +16,7 @@ struct transpose_vectors; // transpose fp16 2x2 __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t& y0, half2_t& y1) { -#if 0 // debug +#if 0 static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -33,11 +33,16 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t y1 = vy1.template AsType()[I0]; #else asm volatile("\n \ - v_pack_b32_f16 %0, %2, %3 \n \ - v_pack_b32_f16 %1, %2, %3, op_sel:[1, 1] \n \ + v_pack_b32_f16 %0, %1, %2 \n \ " - : "=v"(y0), "=v"(y1) - : "v"(x0), "v"(x1), "0"(y0), "1"(y1)); + : "=v"(y0) + : "v"(x0), "v"(x1)); + + asm volatile("\n \ + v_pack_b32_f16 %0, %1, %2, op_sel:[1, 1] \n \ + " + : "=v"(y1) + : "v"(x0), "v"(x1)); #endif } From 310f9ee8428c145683df9375a81f24d81c8e95b0 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 19 Oct 2021 22:44:50 -0500 Subject: [PATCH 14/16] make ThreadwiseTensorSliceTransfer_v3r2 support any tensor --- .../blockwise_tensor_slice_transfer.hpp | 1 - 
.../threadwise_tensor_slice_transfer_v3r2.hpp | 143 +++++++++--------- .../src/gemm_driver_offline.cpp | 6 +- 3 files changed, 73 insertions(+), 77 deletions(-) diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp index f815a47f693..d03bda8fd92 100644 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp @@ -5,7 +5,6 @@ #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" #include "cluster_descriptor.hpp" -#include "threadwise_tensor_slice_transfer.hpp" #include "threadwise_tensor_slice_transfer_v3r2.hpp" namespace ck { diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index 60d24857e70..c906e3bf9e1 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -245,12 +245,10 @@ struct ThreadwiseTensorSliceTransfer_v3r2 __device__ void TransferDataFromSrcThreadScratchToDstThreadScratch() { -#if 0 // debug - static_ford{}([&](auto idx) { - // convert from SrcData to DstData here - dst_thread_scratch_(idx) = type_convert{}(src_thread_scratch_[idx]); - }); -#else + // TODO make this logic more generic + // TODO missing type_convert !!!!!!!!!!!!! + + // if constexpr(SrcVectorDim == DstVectorDim) { static_ford{}([&](auto idx) { @@ -321,7 +319,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 src_vector_refs, dst_vector_refs); }); } -#endif } template @@ -676,12 +673,6 @@ struct ThreadwiseTensorSliceTransfer_v3r2 __device__ static constexpr auto GetSrcThreadScratchDescriptor() { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto src_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); @@ -690,47 +681,48 @@ struct ThreadwiseTensorSliceTransfer_v3r2 constexpr auto src_access_lengths_and_vector_length = container_push_back( sequence_to_tuple_of_number(src_access_lengths), Number{}); + // 1st stage of transforms constexpr auto desc0 = make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length); -#if 0 - // TODO this is hardcoded for GEMM TN layout - // TODO implemenet the general logic - constexpr auto desc1 = transform_tensor_descriptor( - desc0, - make_tuple(make_pass_through_transform(src_access_lengths_and_vector_length[I0]), - make_merge_transform_v3_division_mod( - make_tuple(src_access_lengths_and_vector_length[I1], - src_access_lengths_and_vector_length[I3])), - make_pass_through_transform(src_access_lengths_and_vector_length[I2])), - make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); -#else - // TODO this is hardcoded for NHWC backward-weight kernel - // TODO implemenet the general logic - constexpr auto desc1 = transform_tensor_descriptor( - desc0, - make_tuple(make_pass_through_transform(src_access_lengths_and_vector_length[I0]), - make_pass_through_transform(src_access_lengths_and_vector_length[I1]), - make_merge_transform_v3_division_mod( - make_tuple(src_access_lengths_and_vector_length[I2], - 
src_access_lengths_and_vector_length[I4])), - make_pass_through_transform(src_access_lengths_and_vector_length[I3])), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 4>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); -#endif + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(src_access_lengths_and_vector_length[i], + src_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(src_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); - return desc1; + return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } __device__ static constexpr auto GetDstThreadScratchDescriptor() { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - + // 1st stage of transforms constexpr auto dst_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); @@ -742,34 +734,39 @@ struct ThreadwiseTensorSliceTransfer_v3r2 constexpr auto desc0 = make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length); -#if 0 - // TODO this is hardcoded for GEMM TN layout - // TODO implemenet the general logic - constexpr auto desc1 = transform_tensor_descriptor( - desc0, - make_tuple(make_pass_through_transform(dst_access_lengths_and_vector_length[I0]), - make_pass_through_transform(dst_access_lengths_and_vector_length[I1]), - make_merge_transform_v3_division_mod( - make_tuple(dst_access_lengths_and_vector_length[I2], - dst_access_lengths_and_vector_length[I3]))), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); -#else - // TODO this is hardcoded for NHWC backward-weight kernel - // TODO implemenet the general logic - constexpr auto desc1 = transform_tensor_descriptor( - desc0, - make_tuple(make_pass_through_transform(dst_access_lengths_and_vector_length[I0]), - make_pass_through_transform(dst_access_lengths_and_vector_length[I1]), - make_pass_through_transform(dst_access_lengths_and_vector_length[I2]), - make_merge_transform_v3_division_mod( - make_tuple(dst_access_lengths_and_vector_length[I3], - dst_access_lengths_and_vector_length[I4]))), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); -#endif + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == DstVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(dst_access_lengths_and_vector_length[i], + dst_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(dst_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == DstVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); 
- return desc1; + return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } private: diff --git a/host/driver_offline/src/gemm_driver_offline.cpp b/host/driver_offline/src/gemm_driver_offline.cpp index fec8170422c..e60b4905ae7 100644 --- a/host/driver_offline/src/gemm_driver_offline.cpp +++ b/host/driver_offline/src/gemm_driver_offline.cpp @@ -22,10 +22,10 @@ #include "device_gemm_xdlops_km_kn_nm.hpp" #include "device_gemm_xdlops_km_nk_nm.hpp" -#define USE_GEMM_XDL_MK_KN_MN 0 -#define USE_GEMM_XDL_MK_NK_MN 0 +#define USE_GEMM_XDL_MK_KN_MN 1 +#define USE_GEMM_XDL_MK_NK_MN 1 #define USE_GEMM_XDL_KM_KN_MN 1 -#define USE_GEMM_XDL_KM_NK_MN 0 +#define USE_GEMM_XDL_KM_NK_MN 1 #define USE_GEMM_XDL_MK_KN_NM 0 #define USE_GEMM_XDL_MK_NK_NM 0 #define USE_GEMM_XDL_KM_KN_NM 0 From 0112c76cc6c369aac4d9472a259546d5fcb6a2fe Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 20 Oct 2021 00:26:53 -0500 Subject: [PATCH 15/16] tweak --- .../threadwise_tensor_slice_transfer_v3r2.hpp | 7 +++++++ ...data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp index c906e3bf9e1..13214829f4e 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r2.hpp @@ -245,6 +245,12 @@ struct ThreadwiseTensorSliceTransfer_v3r2 __device__ void TransferDataFromSrcThreadScratchToDstThreadScratch() { +#if 0 // debug + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = type_convert{}(src_thread_scratch_[idx]); + }); +#else // TODO make this logic more generic // TODO missing type_convert !!!!!!!!!!!!! 
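[Editor's note on the "missing type_convert" TODO above] The vectorized path below moves raw bits through the 2x2 fp16 transpose, so it is only correct when SrcData and DstData are the same 16-bit type. A conversion-aware fallback would have to visit elements individually, along the lines of this sketch (a hypothetical helper, not part of the patch; it assumes the SliceLengths, src_thread_scratch_/dst_thread_scratch_ members and the static_ford/type_convert utilities used elsewhere in this file):

    // Scalar fallback: correct for any SrcData/DstData pair, but gives up
    // the packed transpose and its v_pack_b32_f16 codegen.
    __device__ void TransferWithConvert()
    {
        static_ford<SliceLengths>{}([&](auto idx) {
            // convert element-by-element while copying
            dst_thread_scratch_(idx) = type_convert<DstData>{}(src_thread_scratch_[idx]);
        });
    }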
@@ -319,6 +325,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 src_vector_refs, dst_vector_refs); }); } +#endif } template diff --git a/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp b/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp index d6955ec0005..e58fb08914c 100644 --- a/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp +++ b/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp @@ -104,7 +104,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4; constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; -#elif 1 +#elif 0 // [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16 constexpr index_t BlockSize = 256; @@ -132,7 +132,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8; constexpr index_t GemmCThreadTransferDstScalarPerVector = 1; -#elif 0 +#elif 1 // [M, N, K0, K1] = [128, 256, 4, 8], C = 128, for fp16 constexpr index_t BlockSize = 256; From b684ae4ebdfdc46afc56bed1173a85930e9bc765 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 15 Nov 2021 09:22:22 -0600 Subject: [PATCH 16/16] reorganize file --- .../{tensor_description => utility}/is_known_at_compile_time.hpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename composable_kernel/include/{tensor_description => utility}/is_known_at_compile_time.hpp (100%) diff --git a/composable_kernel/include/tensor_description/is_known_at_compile_time.hpp b/composable_kernel/include/utility/is_known_at_compile_time.hpp similarity index 100% rename from composable_kernel/include/tensor_description/is_known_at_compile_time.hpp rename to composable_kernel/include/utility/is_known_at_compile_time.hpp
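[Editor's closing note] As an end-to-end reference for the series: transpose_vectors turns NX input vectors of NY 16-bit scalars each into NY output vectors of NX scalars each. The element-wise meaning of the 2x2-tiled v_pack loop, written as a plain C++ host program (uint16_t standing in for half_t; illustrative only, not ck code):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // xs holds NX vectors of NY scalars; ys receives the transpose.
        const int NX = 4, NY = 4;
        uint16_t xs[NX][NY], ys[NY][NX];

        for(int ix = 0; ix < NX; ++ix)
            for(int iy = 0; iy < NY; ++iy)
                xs[ix][iy] = static_cast<uint16_t>(ix * NY + iy);

        // what the 2x2-tiled v_pack_b32_f16 loop computes, element-wise
        for(int iy = 0; iy < NY; ++iy)
            for(int ix = 0; ix < NX; ++ix)
                ys[iy][ix] = xs[ix][iy];

        printf("ys[1][2] = %d (expect 2 * NY + 1 = 9)\n", (int)ys[1][2]);
        return 0;
    }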