Merged
56 commits
08c9433
fix relu
Dec 4, 2021
e041175
clean up
Dec 4, 2021
037a578
clean up
Dec 4, 2021
6729957
adding 1x1 conv
Dec 5, 2021
a86da8e
adding 1x1 conv
Dec 5, 2021
aa0a891
added 1x1 conv
Dec 5, 2021
1f8e823
refactor
Dec 6, 2021
c345719
refactor
Dec 6, 2021
cd92911
refactor
Dec 6, 2021
96d1a7a
added profiler for conv+bias+relu+add
Dec 6, 2021
19d19d3
Merge remote-tracking branch 'origin/develop' into tweak
Dec 6, 2021
63bca51
clean up
Dec 6, 2021
29c6b47
adding conv+bias+relu
Dec 6, 2021
8159be3
adding conv+bias+relu
Dec 6, 2021
a84f254
added conv+bias+relu
Dec 6, 2021
e0e1714
Update README.md
Dec 6, 2021
9b363cf
Merge branch 'tweak' of github.com:ROCmSoftwarePlatform/composable_ke…
Dec 6, 2021
8c85a3e
update cpu verification
Dec 11, 2021
03059eb
adding c shuffle
Dec 12, 2021
2fd5e6a
Merge remote-tracking branch 'origin/develop' into tweak
Dec 13, 2021
1b15b21
update static_tensor for dealing with invalid element
Dec 13, 2021
7b5b1c8
adding c shuffle
Dec 13, 2021
b6950a3
debugging
Dec 14, 2021
aa43225
fix bug
Dec 15, 2021
26ce5e1
convert to fp16 before shuffle
Dec 15, 2021
50d7b4f
shuffle more than one M/NRepeat
Dec 15, 2021
0af9345
clean up
Dec 15, 2021
b3ab0e1
remove coordinate step hack from GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
Dec 15, 2021
619661f
clean up
Dec 15, 2021
491437a
remove coordinate step hack from all gridwise gemm xdl
Dec 15, 2021
69e771f
clean up coordinate step hack
Dec 15, 2021
583aab0
clean up coordinate step hack
Dec 15, 2021
bc6513a
ThreadwiseTensorSliceTransfer_v3r2 support pointwise op on both src a…
Dec 15, 2021
847359c
adding output shuffle in conv+bias+relu+add
Dec 16, 2021
ddc1fd4
update
Dec 18, 2021
3092477
added conv+bias+relu+add with c shuffle
Dec 19, 2021
4c4b7cb
added conv+bias+relu+add with c shuffle
Dec 19, 2021
edb1d2c
fix forward_sweep bugs in threadwise copy
Dec 19, 2021
681ede9
clean up
Dec 19, 2021
925a8d7
refactor
Dec 19, 2021
2c7ccf6
clean up
Dec 19, 2021
adbda38
clean up
Dec 20, 2021
8169b04
added conv_c_shuffle+bias_relu
Dec 20, 2021
7dba659
clean up
Dec 20, 2021
d3bd592
added conv+bias+relu+atomic_add
Dec 20, 2021
3597833
clean up
Dec 22, 2021
8767acb
clean up
Dec 22, 2021
f5e64f1
clean up
Dec 22, 2021
0a36976
clean up
Dec 22, 2021
a58b2b7
clean up
Dec 22, 2021
b8aeb85
clean up
Dec 22, 2021
6f341d8
misc fixes; add 1x1 specialization
Dec 23, 2021
09cdfef
clean up
Dec 23, 2021
6bf3cf4
delete unused device op
Dec 23, 2021
3552bca
clean up
Dec 23, 2021
2da57fc
add support for odd C value
Dec 23, 2021
35 changes: 20 additions & 15 deletions composable_kernel/include/tensor_description/static_tensor.hpp
@@ -1,8 +1,6 @@
#ifndef CK_STATIC_TENSOR_HPP
#define CK_STATIC_TENSOR_HPP

#include "ignore.hpp"

namespace ck {

// StaticTensor for Scalar
@@ -17,10 +15,10 @@ struct StaticTensor
static constexpr index_t ndim_ = TensorDesc::GetNumOfDimension();
static constexpr index_t element_space_size_ = desc_.GetElementSpaceSize();

__host__ __device__ constexpr StaticTensor() : invalid_element_value_{0} {}
__host__ __device__ constexpr StaticTensor() : invalid_element_scalar_value_{0} {}

__host__ __device__ constexpr StaticTensor(T invalid_element_value)
: invalid_element_value_{invalid_element_value}
: invalid_element_scalar_value_{invalid_element_value}
{
}

@@ -44,11 +42,11 @@ struct StaticTensor
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
return T{0};
return zero_scalar_value_;
}
else
{
return invalid_element_value_;
return invalid_element_scalar_value_;
}
}
}
@@ -71,12 +69,14 @@ struct StaticTensor
}
else
{
return ignore;
return ignored_element_scalar_;
}
}

StaticBuffer<AddressSpace, T, element_space_size_, true> data_;
T invalid_element_value_ = T{0};
static constexpr T zero_scalar_value_ = T{0};
const T invalid_element_scalar_value_;
T ignored_element_scalar_;
};

// StaticTensor for vector
@@ -97,10 +97,13 @@ struct StaticTensorTupleOfVectorBuffer

using V = vector_type<S, ScalarPerVector>;

__host__ __device__ constexpr StaticTensorTupleOfVectorBuffer() : invalid_element_value_{0} {}
__host__ __device__ constexpr StaticTensorTupleOfVectorBuffer()
: invalid_element_scalar_value_{0}
{
}

__host__ __device__ constexpr StaticTensorTupleOfVectorBuffer(S invalid_element_value)
: invalid_element_value_{invalid_element_value}
: invalid_element_scalar_value_{invalid_element_value}
{
}

@@ -125,11 +128,11 @@ struct StaticTensorTupleOfVectorBuffer
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
return S{0};
return zero_scalar_value_;
}
else
{
return invalid_element_value_;
return invalid_element_scalar_value_;
}
}
}
@@ -153,7 +156,7 @@ struct StaticTensorTupleOfVectorBuffer
}
else
{
return ignore;
return ignored_element_scalar_;
}
}

@@ -186,7 +189,7 @@ struct StaticTensorTupleOfVectorBuffer
else
{
// TODO: is this right way to initialize a vector?
return X{invalid_element_value_};
return X{invalid_element_scalar_value_};
}
}
}
@@ -237,7 +240,9 @@ struct StaticTensorTupleOfVectorBuffer
}

StaticBufferTupleOfVector<AddressSpace, S, num_of_vector_, ScalarPerVector, true> data_;
S invalid_element_value_ = S{0};
static constexpr S zero_scalar_value_ = S{0};
const S invalid_element_scalar_value_ = S{0};
S ignored_element_scalar_;
};

template <AddressSpaceEnum_t AddressSpace,
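The static_tensor.hpp change above splits the old invalid_element_value_ member into three scalars: a constexpr zero_scalar_value_ returned for invalid reads when InvalidElementUseNumericalZeroValue is set, a const invalid_element_scalar_value_ supplied at construction, and a writable ignored_element_scalar_ that absorbs writes to invalid elements in place of the global ignore object. Below is a minimal standalone sketch of that read/write routing; the accessor names are illustrative only, not the real operator[] of StaticTensor, and device decorations are omitted.

template <typename T, bool InvalidElementUseNumericalZeroValue>
struct StaticTensorInvalidElementSketch
{
    // reading an invalid element never touches the backing buffer
    constexpr T GetInvalidElementValue() const
    {
        if constexpr(InvalidElementUseNumericalZeroValue)
        {
            return zero_scalar_value_; // compile-time constant, no freshly built T{0}
        }
        else
        {
            return invalid_element_scalar_value_; // value given at construction
        }
    }

    // writing an invalid element is redirected into a dummy scalar owned by
    // the tensor object, replacing the previous global ignore sink
    T& GetInvalidElementReference() { return ignored_element_scalar_; }

    static constexpr T zero_scalar_value_ = T{0};
    const T invalid_element_scalar_value_ = T{0};
    T ignored_element_scalar_{};
};

For example, StaticTensorInvalidElementSketch<float, true>{}.GetInvalidElementValue() yields 0.0f taken from the static constant rather than a freshly constructed temporary.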
@@ -1,11 +1,11 @@
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_HPP
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_v3r2.hpp"
#include "threadwise_tensor_slice_transfer_v3r1.hpp"

namespace ck {

@@ -15,9 +15,9 @@ namespace ck {
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
template <index_t BlockSize,
typename SrcElementwiseOperation,
typename DstElementwiseOperation,
InMemoryDataOperationEnum_t DstInMemOp,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcData,
@@ -34,35 +34,38 @@ template <index_t BlockSize,
index_t DstScalarStrideInVector,
bool ThreadTransferSrcResetCoordinateAfterRun,
bool ThreadTransferDstResetCoordinateAfterRun>
struct BlockwiseTensorSliceTransfer_v4
struct BlockwiseTensorSliceTransfer_v4r1
{
static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();

static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{};

using Index = MultiIndex<nDim>;

__device__ constexpr BlockwiseTensorSliceTransfer_v4(
__device__ constexpr BlockwiseTensorSliceTransfer_v4r1(
const SrcDesc& src_desc,
const Index& src_block_slice_origin,
const SrcElementwiseOperation& src_element_op,
const DstDesc& dst_desc,
const Index& dst_block_slice_origin,
const SrcElementwiseOperation& src_element_op)
const DstElementwiseOperation& dst_element_op)
: threadwise_transfer_(src_desc,
make_zero_multi_index<nDim>(),
src_element_op,
dst_desc,
make_zero_multi_index<nDim>(),
src_element_op)
dst_element_op)

{
static_assert(nDim == remove_reference_t<remove_cv_t<SrcDesc>>::GetNumOfDimension() &&
nDim == remove_reference_t<remove_cv_t<DstDesc>>::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");

static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
is_same<BlockSliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");

static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
@@ -74,7 +77,7 @@ struct BlockwiseTensorSliceTransfer_v4
const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
make_multi_index(get_thread_local_1d_id()));

const auto thread_data_idx_begin = thread_cluster_idx * ThreadSliceLengths{};
const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;

threadwise_transfer_.SetSrcSliceOrigin(src_desc,
src_block_slice_origin + thread_data_idx_begin);
@@ -114,6 +117,16 @@ struct BlockwiseTensorSliceTransfer_v4
}
}

template <typename SrcBuffer, typename DstBuffer>
__device__ void Run(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
RunRead(src_desc, src_buf);
RunWrite(dst_desc, dst_buf);
}

__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
@@ -152,8 +165,9 @@ struct BlockwiseTensorSliceTransfer_v4
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

using ThreadwiseTransfer =
ThreadwiseTensorSliceTransfer_v3r2<ThreadSliceLengths,
ThreadwiseTensorSliceTransfer_v3r1<decltype(thread_slice_lengths),
SrcElementwiseOperation,
DstElementwiseOperation,
DstInMemOp,
SrcData,
DstData,
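The v4r1 constructor above now takes a separate DstElementwiseOperation alongside the SrcElementwiseOperation and forwards both to ThreadwiseTensorSliceTransfer_v3r1, so a fused kernel can transform data once while loading and once more while storing. The sketch below shows the two functor roles; these definitions are assumptions for illustration, not the element_op types shipped with composable_kernel, and device decorations are omitted.

struct PassThroughSketch
{
    template <typename T>
    constexpr T operator()(const T& x) const
    {
        return x;
    }
};

struct ReluSketch
{
    template <typename T>
    constexpr T operator()(const T& x) const
    {
        return x > T{0} ? x : T{0};
    }
};

// RunRead would apply the source op to each element loaded into the thread
// buffer, and RunWrite the destination op to each element stored, so a
// conv+bias+relu epilogue could pass
//     src_element_op = PassThroughSketch{};
//     dst_element_op = ReluSketch{};
// and fold the activation into the copy-out instead of a separate pass.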
@@ -1,11 +1,11 @@
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V2_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V2_HPP
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_v2.hpp"
#include "threadwise_tensor_slice_transfer_v5r1.hpp"

namespace ck {

@@ -31,13 +31,13 @@ template <index_t BlockSize,
typename DstVectorTensorContiguousDimOrder,
bool ThreadTransferSrcResetCoordinateAfterRun,
bool ThreadTransferDstResetCoordinateAfterRun>
struct BlockwiseTensorSliceTransfer_v4r1
struct BlockwiseTensorSliceTransfer_v5r1
{
static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();

using Index = MultiIndex<nDim>;

__device__ constexpr BlockwiseTensorSliceTransfer_v4r1(const SrcDesc& src_desc,
__device__ constexpr BlockwiseTensorSliceTransfer_v5r1(const SrcDesc& src_desc,
const Index& src_block_slice_origin,
const DstDesc& dst_desc,
const Index& dst_block_slice_origin)
@@ -134,7 +134,7 @@ struct BlockwiseTensorSliceTransfer_v4r1
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

using ThreadwiseTransfer =
ThreadwiseTensorSliceTransfer_v3r1<ThreadSliceLengths,
ThreadwiseTensorSliceTransfer_v5r1<ThreadSliceLengths,
DstInMemOp,
SrcData,
DstData,
@@ -0,0 +1,133 @@
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V6R1_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V6R1_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_tensor_slice_transfer_v6r1.hpp"

namespace ck {

// this version does following things to avoid scratch memory issue
// 1. Use StaticallyIndexedArray instead of C array for thread buffer
// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
template <index_t BlockSize,
typename ElementwiseOperation,
InMemoryDataOperationEnum_t DstInMemOp,
typename BlockSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename DimAccessOrder,
index_t VectorDim,
index_t ScalarPerVector,
bool ThreadTransferSrcResetCoordinateAfterRun,
bool ThreadTransferDstResetCoordinateAfterRun>
struct BlockwiseTensorSliceTransfer_v6r1
{
static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();

static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{};

using Index = MultiIndex<nDim>;

__device__ constexpr BlockwiseTensorSliceTransfer_v6r1(const SrcDesc& src_desc,
const Index& src_block_slice_origin,
const DstDesc& dst_desc,
const Index& dst_block_slice_origin,
const ElementwiseOperation& element_op)
: threadwise_transfer_(src_desc,
make_zero_multi_index<nDim>(),
dst_desc,
make_zero_multi_index<nDim>(),
element_op)

{
static_assert(nDim == remove_reference_t<remove_cv_t<SrcDesc>>::GetNumOfDimension() &&
nDim == remove_reference_t<remove_cv_t<DstDesc>>::GetNumOfDimension() &&
nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == DimAccessOrder::Size(),
"wrong! nDim not consistent");

static_assert(
is_same<BlockSliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");

static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
"wrong! BlockSize too small");

if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
make_multi_index(get_thread_local_1d_id()));

const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;

threadwise_transfer_.SetSrcSliceOrigin(src_desc,
src_block_slice_origin + thread_data_idx_begin);
threadwise_transfer_.SetDstSliceOrigin(dst_desc,
dst_block_slice_origin + thread_data_idx_begin);
}
}

template <typename SrcBuffer, typename DstBuffer>
__device__ void Run(const SrcDesc& src_desc,
const SrcBuffer& src_buf,
const DstDesc& dst_desc,
DstBuffer& dst_buf)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.Run(src_desc, src_buf, dst_desc, dst_buf);
}
}

__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
}
}

__device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
}
}

private:
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

using ThreadwiseTransfer =
ThreadwiseTensorSliceTransfer_v6r1<SrcData,
DstData,
SrcDesc,
DstDesc,
ElementwiseOperation,
decltype(thread_slice_lengths),
DimAccessOrder,
VectorDim,
ScalarPerVector,
DstInMemOp,
ThreadTransferSrcResetCoordinateAfterRun,
ThreadTransferDstResetCoordinateAfterRun>;

ThreadwiseTransfer threadwise_transfer_;
};

} // namespace ck
#endif
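BlockwiseTensorSliceTransfer_v6r1 above, like v4r1 earlier in the diff, maps threads to data in its constructor by decomposing the 1-D thread id through the thread cluster descriptor, scaling the resulting cluster index by thread_slice_lengths to get that thread's slice origin, and masking threads beyond the cluster size out of every Run/Move call. Below is a plain-integer sketch of that mapping for a hypothetical 128x64 block slice on a 32x8 row-major thread cluster; the numbers and the by-hand index math are for illustration only, whereas the real code uses ThreadClusterLengths, ThreadClusterArrangeOrder and CalculateBottomIndex.

#include <array>
#include <cstdio>

int main()
{
    constexpr std::array<int, 2> block_slice_lengths{128, 64};
    constexpr std::array<int, 2> thread_cluster_lengths{32, 8}; // 256 worker threads

    // thread_slice_lengths = BlockSliceLengths / ThreadClusterLengths
    constexpr std::array<int, 2> thread_slice_lengths{
        block_slice_lengths[0] / thread_cluster_lengths[0],  // 4
        block_slice_lengths[1] / thread_cluster_lengths[1]}; // 8

    const int tid = 45; // stand-in for get_thread_local_1d_id()

    // row-major equivalent of thread_cluster_desc_.CalculateBottomIndex(tid)
    const std::array<int, 2> thread_cluster_idx{tid / thread_cluster_lengths[1],
                                                tid % thread_cluster_lengths[1]};

    // thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths
    const std::array<int, 2> thread_data_idx_begin{
        thread_cluster_idx[0] * thread_slice_lengths[0],
        thread_cluster_idx[1] * thread_slice_lengths[1]};

    std::printf("thread %d starts at (%d, %d) and owns a %d x %d slice\n",
                tid,
                thread_data_idx_begin[0],
                thread_data_idx_begin[1],
                thread_slice_lengths[0],
                thread_slice_lengths[1]);
}

With these numbers, thread 45 lands at cluster index (5, 5) and slice origin (20, 40).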