Merged

17 commits
bbcecd5  Merge branch 'develop' into generic_instance (zjing14, May 30, 2023)
240900b  Add NumReduceDim template parameter to DeviceSoftmax and Softmax clie… (qianfengz, May 30, 2023)
de0bb3c  Move the generic kernel instance to be the first of the instance list… (qianfengz, May 30, 2023)
a9f0d00  Add GetGenericInstance() interface for DeviceOperationInstanceFactory… (qianfengz, May 31, 2023)
f629cd9  Add testing of GetGenericInstance() in client_example of Softmax (qianfengz, May 31, 2023)
e76393c  Merge branch 'generic_instance' of https://github.com/ROCmSoftwarePla… (qianfengz, May 31, 2023)
110f517  Merge branch 'develop' into generic_instance (qianfengz, Jun 1, 2023)
49a1c09  Revert "Add testing of GetGenericInstance() in client_example of Soft… (qianfengz, Jun 5, 2023)
3c0e60b  Revert "Add GetGenericInstance() interface for DeviceOperationInstanc… (qianfengz, Jun 5, 2023)
43eb43b  Support generic kernel instance to be the first instance returned by … (qianfengz, Jun 6, 2023)
ab11f54  Move generic kernel instance to separate tuple for elementwise op of … (qianfengz, Jun 7, 2023)
2b721d8  Remove un-used files for softmax instance (qianfengz, Jun 7, 2023)
53a0320  Store generic kernel instance to separate tuple for softmax (qianfengz, Jun 7, 2023)
0fbff90  Add IsSupported checking for generic instance to client example of so… (qianfengz, Jun 7, 2023)
559b02b  Replace the get_device_normalize_from_mean_meansquare_instances() by … (qianfengz, Jun 7, 2023)
20f7c63  clang-format fix (qianfengz, Jun 8, 2023)
e0e2207  Remove int8 from softmax instances (qianfengz, Jun 15, 2023)
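
Taken together, these commits make NumReduceDim a compile-time template parameter of DeviceSoftmax, drop the runtime GetRank()/GetNumReduceDim() filtering, and return the generic kernel instance (kept in a separate tuple) as the first element of the factory's instance list. A minimal sketch of the resulting client-side flow follows; it mirrors the updated client_example/06_softmax/softmax4d.cpp further down, but the concrete type aliases and the Rank/NumReduceDim values are illustrative assumptions, not part of this diff, and the usual CK includes are assumed.

// Sketch only (assumed aliases, not the verbatim example file).
using InDataType  = ck::half_t;
using AccDataType = float;
using OutDataType = ck::half_t;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

constexpr ck::index_t Rank         = 4; // number of input dimensions
constexpr ck::index_t NumReduceDim = 2; // dimensions reduced by the softmax

// DeviceSoftmax is now parameterized on NumReduceDim as well, so the factory
// already returns instances matching both Rank and NumReduceDim.
using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
                                                             AccDataType,
                                                             OutDataType,
                                                             PassThrough,
                                                             PassThrough,
                                                             Rank,
                                                             NumReduceDim>;

const auto op_ptrs = ck::tensor_operation::device::instance::
    DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

// The generic kernel instance is returned first, so op_ptrs[0] is expected to
// support any input shape of the given data types (see the IsSupportedArgument
// checks added to the client examples below).
const auto& generic_op_ptr = op_ptrs[0];
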
@@ -172,18 +172,19 @@ int main()
BLayout,
CLayout>();

const auto normalize_ptrs =
ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances<
CDataType,
ReduceDataType,
ReduceDataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType>();

std::cout << "found " << gemm_reduce_ptrs.size()
<< " gemm_reduceMean_reduceSquareMean instances" << std::endl;

using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<CDataType, ReduceDataType, ReduceDataType, GammaDataType, BetaDataType>,
ck::Tuple<LayerNormOutDataType>,
ck::tensor_operation::element_wise::Normalize,
2>;

const auto normalize_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
NormalizeDeviceOp>::GetInstances();

std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;

auto f_matrix_space_size =
32 changes: 25 additions & 7 deletions client_example/06_softmax/softmax4d.cpp
@@ -53,12 +53,35 @@ int main(int argc, char* argv[])
SimpleDeviceMem in(sizeof(InDataType) * num_elements);
SimpleDeviceMem out(sizeof(OutDataType) * num_elements);

using DeviceOp = ck::tensor_operation::device::
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
AccDataType,
OutDataType,
PassThrough,
PassThrough,
Rank,
NumReduceDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();

auto& generic_op_ptr = op_ptrs[0];

auto generic_argument_ptr = generic_op_ptr->MakeArgumentPointer(in_lengths,
in_strides,
reduce_dims,
alpha,
beta,
in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
PassThrough{},
PassThrough{});

if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
{
throw std::runtime_error(
"The generic kernel instance should be able to support any input shapes");
};

std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

std::string best_op_name;
@@ -74,11 +97,6 @@ int main(int argc, char* argv[])
{
auto& op_ptr = op_ptrs[i];

if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim)
{
continue;
}

auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
in_strides,
reduce_dims,
24 changes: 24 additions & 0 deletions client_example/18_groupnorm/groupnorm_swish.cpp
@@ -72,6 +72,30 @@ int main(int argc, char* argv[])

std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

const auto& generic_op_ptr = op_ptrs[0];

auto generic_argument_ptr =
generic_op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
nullptr,
nullptr,
Swish{});

if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
{
throw std::runtime_error(
"The generic kernel instance should be able to support any input shapes");
};

std::string best_op_name;
bool found = false;
int best_op_id = -1;
17 changes: 11 additions & 6 deletions include/ck/tensor_operation/gpu/device/device_softmax.hpp
@@ -18,7 +18,8 @@ template <typename InDataType,
typename OutDataType,
typename InElementwiseOp,
typename AccElementwiseOp,
index_t Rank>
index_t Rank,
index_t NumReduceDim>
struct DeviceSoftmax : public BaseOperator
{
//
@@ -49,18 +50,22 @@ struct DeviceSoftmax : public BaseOperator
AccElementwiseOp acc_elementwise_op) = 0;

virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
virtual index_t GetRank() const = 0;
virtual index_t GetNumReduceDim() const = 0;
};

template <typename InDataType,
typename AccDataType,
typename OutDataType,
typename InElementwiseOp,
typename AccElementwiseOp,
index_t Rank>
using DeviceSoftmaxPtr = std::unique_ptr<
DeviceSoftmax<InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank>>;
index_t Rank,
index_t NumReduceDim>
using DeviceSoftmaxPtr = std::unique_ptr<DeviceSoftmax<InDataType,
AccDataType,
OutDataType,
InElementwiseOp,
AccElementwiseOp,
Rank,
NumReduceDim>>;

} // namespace device
} // namespace tensor_operation
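
With NumReduceDim added to the base class, the DeviceSoftmaxPtr alias is also typed on the number of reduced dimensions, so each instance container names its (Rank, NumReduceDim) combination explicitly. A brief illustration, with data types chosen only for the example (not taken from this diff):

// Illustration only: an instance container typed on Rank = 4, NumReduceDim = 2.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

std::vector<ck::tensor_operation::device::
                DeviceSoftmaxPtr<ck::half_t, float, ck::half_t, PassThrough, PassThrough, 4, 2>>
    rank4_reduce2_instances;
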
@@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
OutDataType,
InElementwiseOp,
AccElementwiseOp,
Rank>
Rank,
NumReduceDim>
{
static constexpr index_t kRank = Rank;
static constexpr index_t kNumReduceDim = NumReduceDim;
static constexpr index_t kNumInvariantDim = Rank - NumReduceDim;

virtual index_t GetRank() const override { return kRank; }

virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }

static constexpr index_t NumInvariantDim = Rank - NumReduceDim;

static constexpr index_t NumSrcDim = Rank;
@@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
{
if constexpr(InSrcVectorDim == 0)
{
if constexpr(kNumInvariantDim == 0)
if constexpr(NumInvariantDim == 0)
{
return false;
}
else
{
if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
if(arg.inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
{
return false;
}
@@ -316,7 +309,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
}

// To improve
if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
if(NumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
{
return false;
}
@@ -5,11 +5,10 @@

#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"

namespace ck {
namespace tensor_operation {
@@ -29,20 +28,34 @@ template <typename InputType,
typename GammaDataType,
typename BetaDataType,
typename OutputType>
auto get_device_normalize_from_mean_meansquare_instances()
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<InputType, MeanType, MeanSquareType, GammaDataType, BetaDataType>,
ck::Tuple<OutputType>,
Normalize,
2>>
{
std::vector<DeviceNormalizeFromMeanMeanSquarePtr> op_ptrs;
using DeviceOp = DeviceElementwise<
ck::Tuple<InputType, MeanType, MeanSquareType, GammaDataType, BetaDataType>,
ck::Tuple<OutputType>,
Normalize,
2>;

if constexpr(is_same<InputType, half_t>::value && is_same<MeanType, float>::value &&
is_same<MeanSquareType, float>::value && is_same<GammaDataType, half_t>::value &&
is_same<BetaDataType, half_t>::value && is_same<OutputType, half_t>::value)
static auto GetInstances()
{
ck::tensor_operation::device::instance::
add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs);
}

return op_ptrs;
}
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

if constexpr(is_same<InputType, half_t>::value && is_same<MeanType, float>::value &&
is_same<MeanSquareType, float>::value &&
is_same<GammaDataType, half_t>::value &&
is_same<BetaDataType, half_t>::value && is_same<OutputType, half_t>::value)
{
ck::tensor_operation::device::instance::
add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs);
}

return op_ptrs;
};
};

} // namespace instance
} // namespace device
@@ -9,34 +9,33 @@
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_softmax_f16_f16_rank3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f16_f16_rank4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);

void add_device_softmax_f32_f32_rank3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f32_f32_rank4_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);

void add_device_softmax_i8_i8_rank3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>&);
void add_device_softmax_i8_i8_rank4_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>&);

template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>>
template <typename InDataType,
typename AccDataType,
typename OutDataType,
index_t Rank,
index_t NumReduceDim>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceSoftmax<InDataType,
AccDataType,
OutDataType,
PassThrough,
PassThrough,
Rank,
NumReduceDim>>
{
using DeviceOp =
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
using DeviceOp = DeviceSoftmax<InDataType,
AccDataType,
OutDataType,
PassThrough,
PassThrough,
Rank,
NumReduceDim>;

static auto GetInstances()
{
@@ -46,25 +45,49 @@ struct DeviceOperationInstanceFactory<
std::is_same_v<OutDataType, F16>)
{
if constexpr(Rank == 3)
add_device_softmax_f16_f16_rank3_instances(op_ptrs);
{
if constexpr(NumReduceDim == 1)
add_device_softmax_f16_f16_rank3_reduce1_instances(op_ptrs);
else if constexpr(NumReduceDim == 2)
add_device_softmax_f16_f16_rank3_reduce2_instances(op_ptrs);
else if constexpr(NumReduceDim == 3)
add_device_softmax_f16_f16_rank3_reduce3_instances(op_ptrs);
}
else if constexpr(Rank == 4)
add_device_softmax_f16_f16_rank4_instances(op_ptrs);
{
if constexpr(NumReduceDim == 1)
add_device_softmax_f16_f16_rank4_reduce1_instances(op_ptrs);
else if constexpr(NumReduceDim == 2)
add_device_softmax_f16_f16_rank4_reduce2_instances(op_ptrs);
else if constexpr(NumReduceDim == 3)
add_device_softmax_f16_f16_rank4_reduce3_instances(op_ptrs);
else if constexpr(NumReduceDim == 4)
add_device_softmax_f16_f16_rank4_reduce4_instances(op_ptrs);
}
}
else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
std::is_same_v<OutDataType, F32>)
{
if constexpr(Rank == 3)
add_device_softmax_f32_f32_rank3_instances(op_ptrs);
else if constexpr(Rank == 4)
add_device_softmax_f32_f32_rank4_instances(op_ptrs);
}
else if constexpr(std::is_same_v<InDataType, I8> && std::is_same_v<AccDataType, F32> &&
std::is_same_v<OutDataType, I8>)
{
if constexpr(Rank == 3)
add_device_softmax_i8_i8_rank3_instances(op_ptrs);
{
if constexpr(NumReduceDim == 1)
add_device_softmax_f32_f32_rank3_reduce1_instances(op_ptrs);
else if constexpr(NumReduceDim == 2)
add_device_softmax_f32_f32_rank3_reduce2_instances(op_ptrs);
else if constexpr(NumReduceDim == 3)
add_device_softmax_f32_f32_rank3_reduce3_instances(op_ptrs);
}
else if constexpr(Rank == 4)
add_device_softmax_i8_i8_rank4_instances(op_ptrs);
{
if constexpr(NumReduceDim == 1)
add_device_softmax_f32_f32_rank4_reduce1_instances(op_ptrs);
else if constexpr(NumReduceDim == 2)
add_device_softmax_f32_f32_rank4_reduce2_instances(op_ptrs);
else if constexpr(NumReduceDim == 3)
add_device_softmax_f32_f32_rank4_reduce3_instances(op_ptrs);
else if constexpr(NumReduceDim == 4)
add_device_softmax_f32_f32_rank4_reduce4_instances(op_ptrs);
}
}

return op_ptrs;
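
The per-rank forward declarations that used to live in this header are gone; the add-instance functions now come from the newly included device_softmax_instance.hpp. Judging from the calls above and from the removed declarations, that header is assumed to declare one function per data type, rank, and reduce-dim combination along the following lines (a hypothetical reconstruction, not the verbatim header; only the rank-3 F16 variants are shown):

// Assumed declaration pattern for device_softmax_instance.hpp; the remaining
// (data type, rank, reduce-dim) combinations would follow the same shape.
void add_device_softmax_f16_f16_rank3_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>&);
void add_device_softmax_f16_f16_rank3_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>&);
void add_device_softmax_f16_f16_rank3_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>&);
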

This file was deleted.
