From 240900b1c086f480e99f8cc2deab62c20709a2ff Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Tue, 30 May 2023 21:17:54 +0000 Subject: [PATCH 01/14] Add NumReduceDim template parameter to DeviceSoftmax and Softmax client API to simplify instances collecting --- client_example/06_softmax/softmax4d.cpp | 14 +- .../gpu/device/device_softmax.hpp | 17 +- .../gpu/device/impl/device_softmax_impl.hpp | 17 +- .../tensor_operation_instance/gpu/softmax.hpp | 101 ++++++--- .../device_softmax_f16_f16_instance.hpp | 22 -- ...softmax_f16_f16_instance_rank3_reduce1.hpp | 2 +- ...softmax_f16_f16_instance_rank3_reduce2.hpp | 2 +- ...softmax_f16_f16_instance_rank3_reduce3.hpp | 2 +- ...softmax_f16_f16_instance_rank4_reduce1.hpp | 2 +- ...softmax_f16_f16_instance_rank4_reduce2.hpp | 2 +- ...softmax_f16_f16_instance_rank4_reduce3.hpp | 2 +- ...softmax_f16_f16_instance_rank4_reduce4.hpp | 2 +- .../device_softmax_f32_f32_instance.hpp | 22 -- ...softmax_f32_f32_instance_rank3_reduce1.hpp | 2 +- ...softmax_f32_f32_instance_rank3_reduce2.hpp | 2 +- ...softmax_f32_f32_instance_rank3_reduce3.hpp | 2 +- ...softmax_f32_f32_instance_rank4_reduce1.hpp | 2 +- ...softmax_f32_f32_instance_rank4_reduce2.hpp | 2 +- ...softmax_f32_f32_instance_rank4_reduce3.hpp | 2 +- ...softmax_f32_f32_instance_rank4_reduce4.hpp | 2 +- .../softmax/device_softmax_i8_i8_instance.hpp | 22 -- ...e_softmax_i8_i8_instance_rank3_reduce1.hpp | 2 +- ...e_softmax_i8_i8_instance_rank3_reduce2.hpp | 2 +- ...e_softmax_i8_i8_instance_rank3_reduce3.hpp | 2 +- ...e_softmax_i8_i8_instance_rank4_reduce1.hpp | 2 +- ...e_softmax_i8_i8_instance_rank4_reduce2.hpp | 2 +- ...e_softmax_i8_i8_instance_rank4_reduce3.hpp | 2 +- ...e_softmax_i8_i8_instance_rank4_reduce4.hpp | 2 +- .../gpu/softmax/device_softmax_instance.hpp | 24 ++- .../gpu/softmax/CMakeLists.txt | 3 - .../device_softmax_f16_f16_instance.cpp | 40 ---- ...softmax_f16_f16_instance_rank3_reduce1.cpp | 6 +- ...softmax_f16_f16_instance_rank3_reduce2.cpp | 6 +- 
...softmax_f16_f16_instance_rank3_reduce3.cpp | 6 +- ...softmax_f16_f16_instance_rank4_reduce1.cpp | 6 +- ...softmax_f16_f16_instance_rank4_reduce2.cpp | 6 +- ...softmax_f16_f16_instance_rank4_reduce3.cpp | 6 +- ...softmax_f16_f16_instance_rank4_reduce4.cpp | 6 +- .../device_softmax_f32_f32_instance.cpp | 40 ---- ...softmax_f32_f32_instance_rank3_reduce1.cpp | 6 +- ...softmax_f32_f32_instance_rank3_reduce2.cpp | 6 +- ...softmax_f32_f32_instance_rank3_reduce3.cpp | 6 +- ...softmax_f32_f32_instance_rank4_reduce1.cpp | 6 +- ...softmax_f32_f32_instance_rank4_reduce2.cpp | 6 +- ...softmax_f32_f32_instance_rank4_reduce3.cpp | 6 +- ...softmax_f32_f32_instance_rank4_reduce4.cpp | 6 +- .../softmax/device_softmax_i8_i8_instance.cpp | 40 ---- ...e_softmax_i8_i8_instance_rank3_reduce1.cpp | 6 +- ...e_softmax_i8_i8_instance_rank3_reduce2.cpp | 6 +- ...e_softmax_i8_i8_instance_rank3_reduce3.cpp | 6 +- ...e_softmax_i8_i8_instance_rank4_reduce1.cpp | 6 +- ...e_softmax_i8_i8_instance_rank4_reduce2.cpp | 6 +- ...e_softmax_i8_i8_instance_rank4_reduce3.cpp | 6 +- ...e_softmax_i8_i8_instance_rank4_reduce4.cpp | 6 +- .../include/profiler/profile_softmax_impl.hpp | 30 ++- profiler/src/profile_softmax.cpp | 191 ++++++++++++++---- test/softmax/test_softmax_util.hpp | 88 +++++++- 57 files changed, 441 insertions(+), 398 deletions(-) delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp 
diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp index e939ce8dfed..aef5624cadc 100644 --- a/client_example/06_softmax/softmax4d.cpp +++ b/client_example/06_softmax/softmax4d.cpp @@ -53,8 +53,13 @@ int main(int argc, char* argv[]) SimpleDeviceMem in(sizeof(InDataType) * num_elements); SimpleDeviceMem out(sizeof(OutDataType) * num_elements); - using DeviceOp = ck::tensor_operation::device:: - DeviceSoftmax; + using DeviceOp = ck::tensor_operation::device::DeviceSoftmax; // get device op instances const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< DeviceOp>::GetInstances(); @@ -74,11 +79,6 @@ int main(int argc, char* argv[]) { auto& op_ptr = op_ptrs[i]; - if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim) - { - continue; - } - auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, in_strides, reduce_dims, diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/include/ck/tensor_operation/gpu/device/device_softmax.hpp index 94f788e5177..1ac746c3f46 100644 --- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp +++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp @@ -18,7 +18,8 @@ template + index_t Rank, + index_t NumReduceDim> struct DeviceSoftmax : public BaseOperator { // @@ -49,8 +50,6 @@ struct DeviceSoftmax : public BaseOperator AccElementwiseOp acc_elementwise_op) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; - virtual index_t GetRank() const = 0; - virtual index_t GetNumReduceDim() const = 0; }; template -using DeviceSoftmaxPtr = std::unique_ptr< - DeviceSoftmax>; + index_t Rank, + index_t NumReduceDim> +using DeviceSoftmaxPtr = std::unique_ptr>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp index ed96b7340cf..fd2577913e2 100644 --- 
a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp @@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax + Rank, + NumReduceDim> { - static constexpr index_t kRank = Rank; - static constexpr index_t kNumReduceDim = NumReduceDim; - static constexpr index_t kNumInvariantDim = Rank - NumReduceDim; - - virtual index_t GetRank() const override { return kRank; } - - virtual index_t GetNumReduceDim() const override { return kNumReduceDim; } - static constexpr index_t NumInvariantDim = Rank - NumReduceDim; static constexpr index_t NumSrcDim = Rank; @@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0) + if(NumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0) { return false; } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp index 36eb092f0f0..b4e816d822a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp @@ -9,34 +9,33 @@ #include "ck/ck.hpp" #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -void add_device_softmax_f16_f16_rank3_instances( - std::vector>&); -void add_device_softmax_f16_f16_rank4_instances( - std::vector>&); - -void add_device_softmax_f32_f32_rank3_instances( - std::vector>&); -void add_device_softmax_f32_f32_rank4_instances( - std::vector>&); - -void add_device_softmax_i8_i8_rank3_instances( - std::vector>&); -void add_device_softmax_i8_i8_rank4_instances( - std::vector>&); - 
-template -struct DeviceOperationInstanceFactory< - ck::tensor_operation::device:: - DeviceSoftmax> +template +struct DeviceOperationInstanceFactory> { - using DeviceOp = - DeviceSoftmax; + using DeviceOp = DeviceSoftmax; static auto GetInstances() { @@ -46,25 +45,73 @@ struct DeviceOperationInstanceFactory< std::is_same_v) { if constexpr(Rank == 3) - add_device_softmax_f16_f16_rank3_instances(op_ptrs); + { + if constexpr(NumReduceDim == 1) + add_device_softmax_f16_f16_rank3_reduce1_instances(op_ptrs); + else if constexpr(NumReduceDim == 2) + add_device_softmax_f16_f16_rank3_reduce2_instances(op_ptrs); + else if constexpr(NumReduceDim == 3) + add_device_softmax_f16_f16_rank3_reduce3_instances(op_ptrs); + } else if constexpr(Rank == 4) - add_device_softmax_f16_f16_rank4_instances(op_ptrs); + { + if constexpr(NumReduceDim == 1) + add_device_softmax_f16_f16_rank4_reduce1_instances(op_ptrs); + else if constexpr(NumReduceDim == 2) + add_device_softmax_f16_f16_rank4_reduce2_instances(op_ptrs); + else if constexpr(NumReduceDim == 3) + add_device_softmax_f16_f16_rank4_reduce3_instances(op_ptrs); + else if constexpr(NumReduceDim == 4) + add_device_softmax_f16_f16_rank4_reduce4_instances(op_ptrs); + } } else if constexpr(std::is_same_v && std::is_same_v && std::is_same_v) { if constexpr(Rank == 3) - add_device_softmax_f32_f32_rank3_instances(op_ptrs); + { + if constexpr(NumReduceDim == 1) + add_device_softmax_f32_f32_rank3_reduce1_instances(op_ptrs); + else if constexpr(NumReduceDim == 2) + add_device_softmax_f32_f32_rank3_reduce2_instances(op_ptrs); + else if constexpr(NumReduceDim == 3) + add_device_softmax_f32_f32_rank3_reduce3_instances(op_ptrs); + } else if constexpr(Rank == 4) - add_device_softmax_f32_f32_rank4_instances(op_ptrs); + { + if constexpr(NumReduceDim == 1) + add_device_softmax_f32_f32_rank4_reduce1_instances(op_ptrs); + else if constexpr(NumReduceDim == 2) + add_device_softmax_f32_f32_rank4_reduce2_instances(op_ptrs); + else if constexpr(NumReduceDim == 3) 
+ add_device_softmax_f32_f32_rank4_reduce3_instances(op_ptrs); + else if constexpr(NumReduceDim == 4) + add_device_softmax_f32_f32_rank4_reduce4_instances(op_ptrs); + } } else if constexpr(std::is_same_v && std::is_same_v && std::is_same_v) { if constexpr(Rank == 3) - add_device_softmax_i8_i8_rank3_instances(op_ptrs); + { + if constexpr(NumReduceDim == 1) + add_device_softmax_i8_i8_rank3_reduce1_instances(op_ptrs); + else if constexpr(NumReduceDim == 2) + add_device_softmax_i8_i8_rank3_reduce2_instances(op_ptrs); + else if constexpr(NumReduceDim == 3) + add_device_softmax_i8_i8_rank3_reduce3_instances(op_ptrs); + } else if constexpr(Rank == 4) - add_device_softmax_i8_i8_rank4_instances(op_ptrs); + { + if constexpr(NumReduceDim == 1) + add_device_softmax_i8_i8_rank4_reduce1_instances(op_ptrs); + else if constexpr(NumReduceDim == 2) + add_device_softmax_i8_i8_rank4_reduce2_instances(op_ptrs); + else if constexpr(NumReduceDim == 3) + add_device_softmax_i8_i8_rank4_reduce3_instances(op_ptrs); + else if constexpr(NumReduceDim == 4) + add_device_softmax_i8_i8_rank4_reduce4_instances(op_ptrs); + } } return op_ptrs; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp deleted file mode 100644 index 83f52fc3ee7..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_f16_f16_rank3_instances( - std::vector>& instances); -void add_device_softmax_f16_f16_rank4_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp index 046ff578055..868d3b72122 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f16_f16_rank3_reduce1_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp index 8e6a226f6a1..b6d422e7b8e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f16_f16_rank3_reduce2_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace 
device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp index 518fa5f9867..88dce37b62e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f16_f16_rank3_reduce3_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp index 10016cdd707..9e2783a9bd7 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f16_f16_rank4_reduce1_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp index cdd5a3cd7b6..d7fc62ca9d4 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f16_f16_rank4_reduce2_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp index a8be272e020..f5f1143fb90 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f16_f16_rank4_reduce3_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp index ec8296ff22f..85fbef53b76 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f16_f16_rank4_reduce4_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp 
b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp deleted file mode 100644 index a6d9a359f46..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_f32_f32_rank3_instances( - std::vector>& instances); -void add_device_softmax_f32_f32_rank4_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp index 6621a2c867a..4cd3db2ab97 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f32_f32_rank3_reduce1_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp index 3dfac98ed8b..20cfbd43af2 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f32_f32_rank3_reduce2_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp index 6d2a0c93250..e3ad524762e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f32_f32_rank3_reduce3_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp index 97dd3dcb18a..8a6f8b4206a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f32_f32_rank4_reduce1_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp index 58f8760accc..f3a4e6b9887 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f32_f32_rank4_reduce2_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp index df8d31f0da7..0721357f588 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f32_f32_rank4_reduce3_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp index 1bd773227e1..a479be7c6ea 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp 
@@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_f32_f32_rank4_reduce4_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp deleted file mode 100644 index f80f712ff5e..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank3_instances( - std::vector>& instances); -void add_device_softmax_i8_i8_rank4_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp index 6f9952e7d58..5d9cbcee2be 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_i8_i8_rank3_reduce1_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp index 2cbd13a1ba5..e2fb5190704 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_i8_i8_rank3_reduce2_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp index 7b12522a859..10f5fb8918c 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_i8_i8_rank3_reduce3_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp index 54d477f80c5..82127e99279 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp @@ -14,7 +14,7 @@ namespace 
device { namespace instance { void add_device_softmax_i8_i8_rank4_reduce1_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp index 4ffc44e3a92..b1a5caf404a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_i8_i8_rank4_reduce2_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp index 08cbb81272f..654d33ca772 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_i8_i8_rank4_reduce3_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp index 187d034b95a..4db2c687a6c 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp @@ -14,7 +14,7 @@ namespace device { namespace instance { void add_device_softmax_i8_i8_rank4_reduce4_instances( - std::vector>& instances); + std::vector>& instances); } // namespace instance } // namespace device diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp index 03be6e2bc7c..86233b9ce71 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp @@ -3,6 +3,24 @@ #pragma once -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp" diff --git a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt index fc13261a6a7..2a96a8570dc 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt @@ -1,5 +1,4 @@ add_instance_library(device_softmax_instance - device_softmax_i8_i8_instance.cpp device_softmax_i8_i8_instance_rank3_reduce1.cpp 
device_softmax_i8_i8_instance_rank3_reduce2.cpp device_softmax_i8_i8_instance_rank3_reduce3.cpp @@ -7,7 +6,6 @@ add_instance_library(device_softmax_instance device_softmax_i8_i8_instance_rank4_reduce2.cpp device_softmax_i8_i8_instance_rank4_reduce3.cpp device_softmax_i8_i8_instance_rank4_reduce4.cpp - device_softmax_f16_f16_instance.cpp device_softmax_f16_f16_instance_rank3_reduce1.cpp device_softmax_f16_f16_instance_rank3_reduce2.cpp device_softmax_f16_f16_instance_rank3_reduce3.cpp @@ -15,7 +13,6 @@ add_instance_library(device_softmax_instance device_softmax_f16_f16_instance_rank4_reduce2.cpp device_softmax_f16_f16_instance_rank4_reduce3.cpp device_softmax_f16_f16_instance_rank4_reduce4.cpp - device_softmax_f32_f32_instance.cpp device_softmax_f32_f32_instance_rank3_reduce1.cpp device_softmax_f32_f32_instance_rank3_reduce2.cpp device_softmax_f32_f32_instance_rank3_reduce3.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp deleted file mode 100644 index 14d2764529c..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp" - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_f16_f16_rank3_instances( - std::vector>& instances) -{ - add_device_softmax_f16_f16_rank3_reduce1_instances(instances); - add_device_softmax_f16_f16_rank3_reduce2_instances(instances); - add_device_softmax_f16_f16_rank3_reduce3_instances(instances); -} - -void add_device_softmax_f16_f16_rank4_instances( - std::vector>& instances) -{ - add_device_softmax_f16_f16_rank4_reduce1_instances(instances); - add_device_softmax_f16_f16_rank4_reduce2_instances(instances); - add_device_softmax_f16_f16_rank4_reduce3_instances(instances); - add_device_softmax_f16_f16_rank4_reduce4_instances(instances); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp index fa334b997c2..3c7c5cb1291 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp +++ 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 3; - void add_device_softmax_f16_f16_rank3_reduce1_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); + add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp index 1c9d37d8483..2ce22a97730 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 3; - void add_device_softmax_f16_f16_rank3_reduce2_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); + add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp index 5fbdab5055e..5ce03f02e2c 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static 
constexpr index_t RANK = 3; - void add_device_softmax_f16_f16_rank3_reduce3_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); + add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp index 7dd8640b187..c020aa341d1 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_f16_f16_rank4_reduce1_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); + add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp index b32fe6838f8..0a3b0978a18 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_f16_f16_rank4_reduce2_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, 
device_softmax_f16_f16_instances{}); + add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp index c05048ec567..cfa0375c09d 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_f16_f16_rank4_reduce3_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); + add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp index 6a235708bd4..679d6a6364d 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_f16_f16_rank4_reduce4_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); + add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{}); } } // namespace instance diff --git 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp deleted file mode 100644 index e5bec5e2639..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp" - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_f32_f32_rank3_instances( - std::vector>& instances) -{ - add_device_softmax_f32_f32_rank3_reduce1_instances(instances); - add_device_softmax_f32_f32_rank3_reduce2_instances(instances); - add_device_softmax_f32_f32_rank3_reduce3_instances(instances); -} - -void add_device_softmax_f32_f32_rank4_instances( - std::vector>& instances) -{ - add_device_softmax_f32_f32_rank4_reduce1_instances(instances); - add_device_softmax_f32_f32_rank4_reduce2_instances(instances); - add_device_softmax_f32_f32_rank4_reduce3_instances(instances); - add_device_softmax_f32_f32_rank4_reduce4_instances(instances); -} - -} // namespace 
instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp index 57d3f184a66..17dfbb54698 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 3; - void add_device_softmax_f32_f32_rank3_reduce1_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); + add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp index fae3a4dd666..03127397044 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 3; - void add_device_softmax_f32_f32_rank3_reduce2_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); + add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp index b6fb70e8e2a..cc9efe1c858 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 3; - void add_device_softmax_f32_f32_rank3_reduce3_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); + add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp index 33c7b6f35f3..a352082990d 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_f32_f32_rank4_reduce1_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); + add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp index c22aa574b1f..ec1619a71d2 100644 --- 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_f32_f32_rank4_reduce2_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); + add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp index 55f3d2bd207..a0cf3d08587 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_f32_f32_rank4_reduce3_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); + add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp index fb0bcf5ee8a..e8b5bc87a5b 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp +++ 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_f32_f32_rank4_reduce4_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); + add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp deleted file mode 100644 index 608cfcf8380..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp" - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank3_instances( - std::vector>& instances) -{ - add_device_softmax_i8_i8_rank3_reduce1_instances(instances); - add_device_softmax_i8_i8_rank3_reduce2_instances(instances); - add_device_softmax_i8_i8_rank3_reduce3_instances(instances); -} - -void add_device_softmax_i8_i8_rank4_instances( - std::vector>& instances) -{ - add_device_softmax_i8_i8_rank4_reduce1_instances(instances); - add_device_softmax_i8_i8_rank4_reduce2_instances(instances); - add_device_softmax_i8_i8_rank4_reduce3_instances(instances); - add_device_softmax_i8_i8_rank4_reduce4_instances(instances); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp index 15552dbae5d..944e0c3bacd 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp +++ 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 3; - void add_device_softmax_i8_i8_rank3_reduce1_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); + add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp index 67674028860..24da3e58d23 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 3; - void add_device_softmax_i8_i8_rank3_reduce2_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); + add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp index 4b33da93c2e..7febbf23534 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 3; - 
void add_device_softmax_i8_i8_rank3_reduce3_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); + add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp index fe3b823e889..08b56f01667 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_i8_i8_rank4_reduce1_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); + add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp index 8ecdf87d9fe..7f0ebceb82b 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_i8_i8_rank4_reduce2_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); + 
add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 2>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp index 35631352040..7145a8d91b0 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_i8_i8_rank4_reduce3_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); + add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp index aa21a0bf8a8..6118a84ab6d 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp @@ -13,12 +13,10 @@ namespace tensor_operation { namespace device { namespace instance { -static constexpr index_t RANK = 4; - void add_device_softmax_i8_i8_rank4_reduce4_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); + add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{}); } } // namespace instance diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp 
index 96816f53bbb..01305f264ef 100644 --- a/profiler/include/profiler/profile_softmax_impl.hpp +++ b/profiler/include/profiler/profile_softmax_impl.hpp @@ -40,7 +40,11 @@ template <> std::string type_to_string() { return "int8"; } template <> std::string type_to_string() { return "int32"; } // clang-format on -template +template bool profile_softmax_impl(int do_verification, int init_method, bool do_log, @@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification, if(Rank != in_length.size()) { throw std::runtime_error("Input tensor rank is different from template argument Rank!"); - } + }; + + if(NumReduceDim != reduce_dims.size()) + { + throw std::runtime_error( + "Input reduce_dims rank is different from template argument NumReduceDim!"); + }; Tensor in = in_strides.empty() ? Tensor(in_length) : Tensor(in_length, in_strides); @@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification, // add device softmax instances using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using DeviceOp = tensor_operation::device:: - DeviceSoftmax; + using DeviceOp = tensor_operation::device::DeviceSoftmax; // get device op instances const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory< @@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification, for(auto& inst_ptr : instances) { - // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3 - // problem to rank 4 kernel) other than invoking IsSupportedArgument()? 
- if(!(inst_ptr->GetNumReduceDim() == static_cast(reduce_dims.size()))) - { - continue; - } - auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths, in_tensor_strides, reduce_dims, diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp index 78b64dda7d7..48a60a42c33 100644 --- a/profiler/src/profile_softmax.cpp +++ b/profiler/src/profile_softmax.cpp @@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[]) { if(data_type == SoftmaxDataType::F16_F16) { - ck::profiler::profile_softmax_impl(do_verification, - init_method, - do_log, - time_kernel, - length, - stride, - reduce, - double(alpha), - double(beta)); + if(reduce.size() == 1) + ck::profiler::profile_softmax_impl( + do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else if(reduce.size() == 2) + ck::profiler::profile_softmax_impl( + do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else if(reduce.size() == 3) + ck::profiler::profile_softmax_impl( + do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else + throw std::runtime_error("invalid number of dimensions to reduce"); } else if(data_type == SoftmaxDataType::F32_F32) { - ck::profiler::profile_softmax_impl(do_verification, - init_method, - do_log, - time_kernel, - length, - stride, - reduce, - double(alpha), - double(beta)); + if(reduce.size() == 1) + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else if(reduce.size() == 2) + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else if(reduce.size() == 3) + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + 
time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else + throw std::runtime_error("invalid number of dimensions to reduce"); } else { @@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[]) { if(data_type == SoftmaxDataType::F16_F16) { - ck::profiler::profile_softmax_impl(do_verification, - init_method, - do_log, - time_kernel, - length, - stride, - reduce, - double(alpha), - double(beta)); + if(reduce.size() == 1) + ck::profiler::profile_softmax_impl( + do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else if(reduce.size() == 2) + ck::profiler::profile_softmax_impl( + do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else if(reduce.size() == 3) + ck::profiler::profile_softmax_impl( + do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else if(reduce.size() == 4) + ck::profiler::profile_softmax_impl( + do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else + throw std::runtime_error("invalid number of dimensions to reduce"); } else if(data_type == SoftmaxDataType::F32_F32) { - ck::profiler::profile_softmax_impl(do_verification, - init_method, - do_log, - time_kernel, - length, - stride, - reduce, - double(alpha), - double(beta)); + if(reduce.size() == 1) + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else if(reduce.size() == 2) + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else if(reduce.size() == 3) + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + 
reduce, + double(alpha), + double(beta)); + else if(reduce.size() == 4) + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + double(alpha), + double(beta)); + else + throw std::runtime_error("invalid number of dimensions to reduce"); } else { diff --git a/test/softmax/test_softmax_util.hpp b/test/softmax/test_softmax_util.hpp index 40b300cf992..673b31f6c10 100644 --- a/test/softmax/test_softmax_util.hpp +++ b/test/softmax/test_softmax_util.hpp @@ -61,8 +61,92 @@ class TestSoftmax : public ::testing::Test int init_method = 1; // integer value initialization bool log = false; std::vector strides; // intenionally empty, to get packed layout. - bool pass = ck::profiler::profile_softmax_impl( - verify_, init_method, log, bench_, in_length, strides, reduce_dims, alpha, beta); + bool pass = false; + + if constexpr(Rank == 3) + { + if(reduce_dims.size() == 1) + pass = ck::profiler:: + profile_softmax_impl(verify_, + init_method, + log, + bench_, + in_length, + strides, + reduce_dims, + alpha, + beta); + else if(reduce_dims.size() == 2) + pass = ck::profiler:: + profile_softmax_impl(verify_, + init_method, + log, + bench_, + in_length, + strides, + reduce_dims, + alpha, + beta); + else if(reduce_dims.size() == 3) + pass = ck::profiler:: + profile_softmax_impl(verify_, + init_method, + log, + bench_, + in_length, + strides, + reduce_dims, + alpha, + beta); + } + else if constexpr(Rank == 4) + { + if(reduce_dims.size() == 1) + pass = ck::profiler:: + profile_softmax_impl(verify_, + init_method, + log, + bench_, + in_length, + strides, + reduce_dims, + alpha, + beta); + else if(reduce_dims.size() == 2) + pass = ck::profiler:: + profile_softmax_impl(verify_, + init_method, + log, + bench_, + in_length, + strides, + reduce_dims, + alpha, + beta); + else if(reduce_dims.size() == 3) + pass = ck::profiler:: + profile_softmax_impl(verify_, + init_method, + log, + bench_, + in_length, + strides, + reduce_dims, + 
alpha, + beta); + else if(reduce_dims.size() == 4) + pass = ck::profiler:: + profile_softmax_impl(verify_, + init_method, + log, + bench_, + in_length, + strides, + reduce_dims, + alpha, + beta); + }; + EXPECT_TRUE(pass); } From de0bb3c26de393d56d17f64e2f064b2c6a7be84c Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Tue, 30 May 2023 22:20:46 +0000 Subject: [PATCH 02/14] Move the generic kernel instance to be the first of the instance list for elementwise op of normalization --- .../gpu/elementwise/device_normalize_instance.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp index 182037f15c6..ff846d400a2 100644 --- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp @@ -28,10 +28,10 @@ using Normalize = ck::tensor_operation::element_wise::Normalize; using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple< // clang-format off //###################|| | functor| NDim| MPerThread| | | + DeviceElementwiseImpl, Tuple, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, DeviceElementwiseImpl, Tuple, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >, DeviceElementwiseImpl, Tuple, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >, - DeviceElementwiseImpl, Tuple, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >, - DeviceElementwiseImpl, Tuple, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> > + DeviceElementwiseImpl, Tuple, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> > // clang-format on >; From a9f0d000eb9fd240404112a526ef125429a351df Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Wed, 31 May 2023 20:03:24 +0000 Subject: [PATCH 03/14] Add GetGenericInstance() interface for 
DeviceOperationInstanceFactory class of DeviceSoftmax --- .../add_device_operation_instance.hpp | 14 ++++ .../tensor_operation_instance/gpu/softmax.hpp | 80 +++++++++++++++++++ ...softmax_f16_f16_instance_rank3_reduce1.hpp | 3 + ...softmax_f16_f16_instance_rank3_reduce2.hpp | 3 + ...softmax_f16_f16_instance_rank3_reduce3.hpp | 3 + ...softmax_f16_f16_instance_rank4_reduce1.hpp | 3 + ...softmax_f16_f16_instance_rank4_reduce2.hpp | 3 + ...softmax_f16_f16_instance_rank4_reduce3.hpp | 3 + ...softmax_f16_f16_instance_rank4_reduce4.hpp | 3 + ...softmax_f32_f32_instance_rank3_reduce1.hpp | 3 + ...softmax_f32_f32_instance_rank3_reduce2.hpp | 3 + ...softmax_f32_f32_instance_rank3_reduce3.hpp | 3 + ...softmax_f32_f32_instance_rank4_reduce1.hpp | 3 + ...softmax_f32_f32_instance_rank4_reduce2.hpp | 3 + ...softmax_f32_f32_instance_rank4_reduce3.hpp | 3 + ...softmax_f32_f32_instance_rank4_reduce4.hpp | 3 + ...e_softmax_i8_i8_instance_rank3_reduce1.hpp | 3 + ...e_softmax_i8_i8_instance_rank3_reduce2.hpp | 3 + ...e_softmax_i8_i8_instance_rank3_reduce3.hpp | 3 + ...e_softmax_i8_i8_instance_rank4_reduce1.hpp | 3 + ...e_softmax_i8_i8_instance_rank4_reduce2.hpp | 3 + ...e_softmax_i8_i8_instance_rank4_reduce3.hpp | 3 + ...e_softmax_i8_i8_instance_rank4_reduce4.hpp | 3 + ...softmax_f16_f16_instance_rank3_reduce1.cpp | 6 ++ ...softmax_f16_f16_instance_rank3_reduce2.cpp | 6 ++ ...softmax_f16_f16_instance_rank3_reduce3.cpp | 6 ++ ...softmax_f16_f16_instance_rank4_reduce1.cpp | 6 ++ ...softmax_f16_f16_instance_rank4_reduce2.cpp | 6 ++ ...softmax_f16_f16_instance_rank4_reduce3.cpp | 6 ++ ...softmax_f16_f16_instance_rank4_reduce4.cpp | 6 ++ ...softmax_f32_f32_instance_rank3_reduce1.cpp | 6 ++ ...softmax_f32_f32_instance_rank3_reduce2.cpp | 6 ++ ...softmax_f32_f32_instance_rank3_reduce3.cpp | 6 ++ ...softmax_f32_f32_instance_rank4_reduce1.cpp | 6 ++ ...softmax_f32_f32_instance_rank4_reduce2.cpp | 6 ++ ...softmax_f32_f32_instance_rank4_reduce3.cpp | 6 ++ 
...softmax_f32_f32_instance_rank4_reduce4.cpp | 6 ++ ...e_softmax_i8_i8_instance_rank3_reduce1.cpp | 6 ++ ...e_softmax_i8_i8_instance_rank3_reduce2.cpp | 6 ++ ...e_softmax_i8_i8_instance_rank3_reduce3.cpp | 6 ++ ...e_softmax_i8_i8_instance_rank4_reduce1.cpp | 6 ++ ...e_softmax_i8_i8_instance_rank4_reduce2.cpp | 6 ++ ...e_softmax_i8_i8_instance_rank4_reduce3.cpp | 6 ++ ...e_softmax_i8_i8_instance_rank4_reduce4.cpp | 6 ++ 44 files changed, 283 insertions(+) diff --git a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp index 20df1b3616a..ab83fb373a8 100644 --- a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp @@ -29,6 +29,20 @@ void add_device_operation_instances(std::vector>& op_ins }); } +template +void get_first_device_operation_instance(std::unique_ptr& op_instance, + const NewOpInstances& new_op_instances) +{ + const auto first_op_instance = std::get<0>(new_op_instances); + + using FirstOpInstance = remove_cvref_t; + + static_assert(std::is_base_of_v, + "wrong! 
FirstOpInstance should be derived from BaseOp"); + + op_instance = std::make_unique(first_op_instance); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp index b4e816d822a..6e1f58cb92e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp @@ -37,6 +37,86 @@ struct DeviceOperationInstanceFactory; + static auto GetGenericInstance() + { + std::unique_ptr op_ptr; + + if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(Rank == 3) + { + if constexpr(NumReduceDim == 1) + get_device_softmax_f16_f16_rank3_reduce1_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 2) + get_device_softmax_f16_f16_rank3_reduce2_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 3) + get_device_softmax_f16_f16_rank3_reduce3_generic_instance(op_ptr); + } + else if constexpr(Rank == 4) + { + if constexpr(NumReduceDim == 1) + get_device_softmax_f16_f16_rank4_reduce1_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 2) + get_device_softmax_f16_f16_rank4_reduce2_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 3) + get_device_softmax_f16_f16_rank4_reduce3_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 4) + get_device_softmax_f16_f16_rank4_reduce4_generic_instance(op_ptr); + } + } + else if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(Rank == 3) + { + if constexpr(NumReduceDim == 1) + get_device_softmax_f32_f32_rank3_reduce1_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 2) + get_device_softmax_f32_f32_rank3_reduce2_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 3) + get_device_softmax_f32_f32_rank3_reduce3_generic_instance(op_ptr); + } + else if 
constexpr(Rank == 4) + { + if constexpr(NumReduceDim == 1) + get_device_softmax_f32_f32_rank4_reduce1_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 2) + get_device_softmax_f32_f32_rank4_reduce2_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 3) + get_device_softmax_f32_f32_rank4_reduce3_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 4) + get_device_softmax_f32_f32_rank4_reduce4_generic_instance(op_ptr); + } + } + else if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(Rank == 3) + { + if constexpr(NumReduceDim == 1) + get_device_softmax_i8_i8_rank3_reduce1_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 2) + get_device_softmax_i8_i8_rank3_reduce2_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 3) + get_device_softmax_i8_i8_rank3_reduce3_generic_instance(op_ptr); + } + else if constexpr(Rank == 4) + { + if constexpr(NumReduceDim == 1) + get_device_softmax_i8_i8_rank4_reduce1_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 2) + get_device_softmax_i8_i8_rank4_reduce2_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 3) + get_device_softmax_i8_i8_rank4_reduce3_generic_instance(op_ptr); + else if constexpr(NumReduceDim == 4) + get_device_softmax_i8_i8_rank4_reduce4_generic_instance(op_ptr); + } + } + + return op_ptr; + }; + static auto GetInstances() { std::vector> op_ptrs; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp index 868d3b72122..0f75242f1ca 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp @@ -16,6 +16,9 @@ namespace instance { void 
add_device_softmax_f16_f16_rank3_reduce1_instances( std::vector>& instances); +void get_device_softmax_f16_f16_rank3_reduce1_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp index b6d422e7b8e..80b7a83cdb6 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f16_f16_rank3_reduce2_instances( std::vector>& instances); +void get_device_softmax_f16_f16_rank3_reduce2_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp index 88dce37b62e..4721b56e6cc 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f16_f16_rank3_reduce3_instances( std::vector>& instances); +void get_device_softmax_f16_f16_rank3_reduce3_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git
a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp index 9e2783a9bd7..fd9ca5e57c8 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce1_instances( std::vector>& instances); +void get_device_softmax_f16_f16_rank4_reduce1_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp index d7fc62ca9d4..6b03ab9595b 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce2_instances( std::vector>& instances); +void get_device_softmax_f16_f16_rank4_reduce2_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp index f5f1143fb90..f62d3310f11 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce3_instances( std::vector>& instances); +void get_device_softmax_f16_f16_rank4_reduce3_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp index 85fbef53b76..17a60b32359 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce4_instances( std::vector>& instances); +void get_device_softmax_f16_f16_rank4_reduce4_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp index 4cd3db2ab97..3c8fae9a41d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f32_f32_rank3_reduce1_instances( std::vector>& 
instances); +void get_device_softmax_f32_f32_rank3_reduce1_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp index 20cfbd43af2..0138c7312e2 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f32_f32_rank3_reduce2_instances( std::vector>& instances); +void get_device_softmax_f32_f32_rank3_reduce2_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp index e3ad524762e..063341cd69d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f32_f32_rank3_reduce3_instances( std::vector>& instances); +void get_device_softmax_f32_f32_rank3_reduce3_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp 
b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp index 8a6f8b4206a..c740619986e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce1_instances( std::vector>& instances); +void get_device_softmax_f32_f32_rank4_reduce1_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp index f3a4e6b9887..89bb27b1974 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce2_instances( std::vector>& instances); +void get_device_softmax_f32_f32_rank4_reduce2_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp index 0721357f588..e1aada99ebf 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce3_instances( std::vector>& instances); +void get_device_softmax_f32_f32_rank4_reduce3_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp index a479be7c6ea..253dde1404e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce4_instances( std::vector>& instances); +void get_device_softmax_f32_f32_rank4_reduce4_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp index 5d9cbcee2be..0634d808ce5 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_i8_i8_rank3_reduce1_instances( std::vector>& instances); +void get_device_softmax_i8_i8_rank3_reduce1_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // 
namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp index e2fb5190704..7fc05eafb27 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_i8_i8_rank3_reduce2_instances( std::vector>& instances); +void get_device_softmax_i8_i8_rank3_reduce2_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp index 10f5fb8918c..1591a4740dd 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_i8_i8_rank3_reduce3_instances( std::vector>& instances); +void get_device_softmax_i8_i8_rank3_reduce3_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp index 82127e99279..a0b94f84e29 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce1_instances( std::vector>& instances); +void get_device_softmax_i8_i8_rank4_reduce1_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp index b1a5caf404a..6cda277fb09 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce2_instances( std::vector>& instances); +void get_device_softmax_i8_i8_rank4_reduce2_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp index 654d33ca772..fefecf1f757 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce3_instances( std::vector>& instances); +void 
get_device_softmax_i8_i8_rank4_reduce3_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp index 4db2c687a6c..f9dc9046205 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp @@ -16,6 +16,9 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce4_instances( std::vector>& instances); +void get_device_softmax_i8_i8_rank4_reduce4_generic_instance( + DeviceSoftmaxPtr& instance); + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp index 3c7c5cb1291..407d791ad13 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank3_reduce1_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{}); } +void get_device_softmax_f16_f16_rank3_reduce1_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 1>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp index 2ce22a97730..9582ec3decc 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank3_reduce2_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{}); } +void get_device_softmax_f16_f16_rank3_reduce2_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 2>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp index 5ce03f02e2c..eada467f4f9 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank3_reduce3_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{}); } +void get_device_softmax_f16_f16_rank3_reduce3_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 3>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp index c020aa341d1..b1eae23539e 100644 --- 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank4_reduce1_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{}); } +void get_device_softmax_f16_f16_rank4_reduce1_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 1>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp index 0a3b0978a18..d35b97d04cb 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank4_reduce2_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{}); } +void get_device_softmax_f16_f16_rank4_reduce2_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 2>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp index cfa0375c09d..24d9ebae0b5 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp +++ 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank4_reduce3_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{}); } +void get_device_softmax_f16_f16_rank4_reduce3_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 3>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp index 679d6a6364d..373a96f6a8c 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank4_reduce4_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{}); } +void get_device_softmax_f16_f16_rank4_reduce4_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 4>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp index 17dfbb54698..cfe838c547c 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank3_reduce1_instances( 
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{}); } +void get_device_softmax_f32_f32_rank3_reduce1_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 1>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp index 03127397044..0d2d6cb3ccd 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank3_reduce2_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{}); } +void get_device_softmax_f32_f32_rank3_reduce2_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 2>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp index cc9efe1c858..315389966e8 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank3_reduce3_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{}); } +void get_device_softmax_f32_f32_rank3_reduce3_generic_instance( + DeviceSoftmaxPtr& instance) +{ + 
get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 3>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp index a352082990d..337b6b3aa28 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank4_reduce1_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{}); } +void get_device_softmax_f32_f32_rank4_reduce1_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 1>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp index ec1619a71d2..8ccb0b739a5 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank4_reduce2_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{}); } +void get_device_softmax_f32_f32_rank4_reduce2_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 2>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp index a0cf3d08587..f8b12112f61 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank4_reduce3_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{}); } +void get_device_softmax_f32_f32_rank4_reduce3_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 3>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp index e8b5bc87a5b..96dbd1cc3a2 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp @@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank4_reduce4_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{}); } +void get_device_softmax_f32_f32_rank4_reduce4_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 4>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp 
index 944e0c3bacd..4f2799c1c11 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp @@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank3_reduce1_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{}); } +void get_device_softmax_i8_i8_rank3_reduce1_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 1>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp index 24da3e58d23..b2c592aab82 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp @@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank3_reduce2_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{}); } +void get_device_softmax_i8_i8_rank3_reduce2_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 2>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp index 7febbf23534..3603fca7220 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp +++ 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp @@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank3_reduce3_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{}); } +void get_device_softmax_i8_i8_rank3_reduce3_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 3>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp index 08b56f01667..a0f2ca2f73a 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp @@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank4_reduce1_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{}); } +void get_device_softmax_i8_i8_rank4_reduce1_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 1>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp index 7f0ebceb82b..4788d6d12bf 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp @@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank4_reduce2_instances( add_device_operation_instances(instances, 
device_softmax_i8_i8_instances<4, 2>{}); } +void get_device_softmax_i8_i8_rank4_reduce2_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 2>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp index 7145a8d91b0..ba1f06629be 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp @@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank4_reduce3_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{}); } +void get_device_softmax_i8_i8_rank4_reduce3_generic_instance( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 3>{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp index 6118a84ab6d..efaf9c02acc 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp @@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank4_reduce4_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{}); } +void add_device_softmax_i8_i8_rank4_reduce4_instances( + DeviceSoftmaxPtr& instance) +{ + get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 4>{}); +} + } // 
namespace instance } // namespace device } // namespace tensor_operation From f629cd9a93ce38dfed4886d849f3c38d2e5379c8 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Wed, 31 May 2023 20:05:54 +0000 Subject: [PATCH 04/14] Add testing of GetGenericInstance() in client_example of Softmax --- client_example/06_softmax/softmax4d.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp index aef5624cadc..401b161d116 100644 --- a/client_example/06_softmax/softmax4d.cpp +++ b/client_example/06_softmax/softmax4d.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" @@ -60,6 +61,24 @@ int main(int argc, char* argv[]) PassThrough, Rank, NumReduceDim>; + + const auto g_op_ptr = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetGenericInstance(); + + auto g_op_argument_ptr = g_op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + reduce_dims, + alpha, + beta, + in.GetDeviceBuffer(), + out.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); + + if(!g_op_ptr->IsSupportedArgument(g_op_argument_ptr.get())) + throw std::runtime_error( + "Generic instance should be suitable for various input lengths/strides"); + // get device op instances const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< DeviceOp>::GetInstances(); @@ -122,6 +141,7 @@ int main(int argc, char* argv[]) << best_op_name << std::endl; // run the best intance + if(found) { auto& op_ptr = op_ptrs[best_op_id]; std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() From 49a1c097c6d24472e694dcb02390459d349b20e0 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Mon, 5 Jun 2023 18:43:51 +0000 Subject: [PATCH 05/14] Revert "Add testing of GetGenericInstance() in client_example of Softmax" This reverts commit 
f629cd9a93ce38dfed4886d849f3c38d2e5379c8. --- client_example/06_softmax/softmax4d.cpp | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp index 4f399b0e6a0..21c226c1ab7 100644 --- a/client_example/06_softmax/softmax4d.cpp +++ b/client_example/06_softmax/softmax4d.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" @@ -61,24 +60,6 @@ int main(int argc, char* argv[]) PassThrough, Rank, NumReduceDim>; - - const auto g_op_ptr = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetGenericInstance(); - - auto g_op_argument_ptr = g_op_ptr->MakeArgumentPointer(in_lengths, - in_strides, - reduce_dims, - alpha, - beta, - in.GetDeviceBuffer(), - out.GetDeviceBuffer(), - PassThrough{}, - PassThrough{}); - - if(!g_op_ptr->IsSupportedArgument(g_op_argument_ptr.get())) - throw std::runtime_error( - "Generic instance should be suitable for various input lengths/strides"); - // get device op instances const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< DeviceOp>::GetInstances(); @@ -141,7 +122,6 @@ int main(int argc, char* argv[]) << best_op_name << std::endl; // run the best intance - if(found) { auto& op_ptr = op_ptrs[best_op_id]; std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() From 3c0e60bcd1c8d4de3083e6bc7b0d93602c051d16 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Mon, 5 Jun 2023 18:44:18 +0000 Subject: [PATCH 06/14] Revert "Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax" This reverts commit a9f0d000eb9fd240404112a526ef125429a351df. 
--- .../add_device_operation_instance.hpp | 14 ---- .../tensor_operation_instance/gpu/softmax.hpp | 80 ------------------- ...softmax_f16_f16_instance_rank3_reduce1.hpp | 3 - ...softmax_f16_f16_instance_rank3_reduce2.hpp | 3 - ...softmax_f16_f16_instance_rank3_reduce3.hpp | 3 - ...softmax_f16_f16_instance_rank4_reduce1.hpp | 3 - ...softmax_f16_f16_instance_rank4_reduce2.hpp | 3 - ...softmax_f16_f16_instance_rank4_reduce3.hpp | 3 - ...softmax_f16_f16_instance_rank4_reduce4.hpp | 3 - ...softmax_f32_f32_instance_rank3_reduce1.hpp | 3 - ...softmax_f32_f32_instance_rank3_reduce2.hpp | 3 - ...softmax_f32_f32_instance_rank3_reduce3.hpp | 3 - ...softmax_f32_f32_instance_rank4_reduce1.hpp | 3 - ...softmax_f32_f32_instance_rank4_reduce2.hpp | 3 - ...softmax_f32_f32_instance_rank4_reduce3.hpp | 3 - ...softmax_f32_f32_instance_rank4_reduce4.hpp | 3 - ...e_softmax_i8_i8_instance_rank3_reduce1.hpp | 3 - ...e_softmax_i8_i8_instance_rank3_reduce2.hpp | 3 - ...e_softmax_i8_i8_instance_rank3_reduce3.hpp | 3 - ...e_softmax_i8_i8_instance_rank4_reduce1.hpp | 3 - ...e_softmax_i8_i8_instance_rank4_reduce2.hpp | 3 - ...e_softmax_i8_i8_instance_rank4_reduce3.hpp | 3 - ...e_softmax_i8_i8_instance_rank4_reduce4.hpp | 3 - ...softmax_f16_f16_instance_rank3_reduce1.cpp | 6 -- ...softmax_f16_f16_instance_rank3_reduce2.cpp | 6 -- ...softmax_f16_f16_instance_rank3_reduce3.cpp | 6 -- ...softmax_f16_f16_instance_rank4_reduce1.cpp | 6 -- ...softmax_f16_f16_instance_rank4_reduce2.cpp | 6 -- ...softmax_f16_f16_instance_rank4_reduce3.cpp | 6 -- ...softmax_f16_f16_instance_rank4_reduce4.cpp | 6 -- ...softmax_f32_f32_instance_rank3_reduce1.cpp | 6 -- ...softmax_f32_f32_instance_rank3_reduce2.cpp | 6 -- ...softmax_f32_f32_instance_rank3_reduce3.cpp | 6 -- ...softmax_f32_f32_instance_rank4_reduce1.cpp | 6 -- ...softmax_f32_f32_instance_rank4_reduce2.cpp | 6 -- ...softmax_f32_f32_instance_rank4_reduce3.cpp | 6 -- ...softmax_f32_f32_instance_rank4_reduce4.cpp | 6 -- 
...e_softmax_i8_i8_instance_rank3_reduce1.cpp | 6 -- ...e_softmax_i8_i8_instance_rank3_reduce2.cpp | 6 -- ...e_softmax_i8_i8_instance_rank3_reduce3.cpp | 6 -- ...e_softmax_i8_i8_instance_rank4_reduce1.cpp | 6 -- ...e_softmax_i8_i8_instance_rank4_reduce2.cpp | 6 -- ...e_softmax_i8_i8_instance_rank4_reduce3.cpp | 6 -- ...e_softmax_i8_i8_instance_rank4_reduce4.cpp | 6 -- 44 files changed, 283 deletions(-) diff --git a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp index 4a9132bc026..f57fed9c07c 100644 --- a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp @@ -29,20 +29,6 @@ void add_device_operation_instances(std::vector>& op_ins }); } -template -void get_first_device_operation_instance(std::unique_ptr& op_instance, - const NewOpInstances& new_op_instances) -{ - const auto first_op_instance = std::get<0>(new_op_instances); - - using FirstOpInstance = remove_cvref_t; - - static_assert(std::is_base_of_v, - "wrong! 
FirstOpInstance should be derived from BaseOp"); - - op_instance = std::make_unique(first_op_instance); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp index 5a454cfbe24..3f82b5bfd86 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp @@ -37,86 +37,6 @@ struct DeviceOperationInstanceFactory; - static auto GetGenericInstance() - { - std::unique_ptr op_ptr; - - if constexpr(std::is_same_v && std::is_same_v && - std::is_same_v) - { - if constexpr(Rank == 3) - { - if constexpr(NumReduceDim == 1) - get_device_softmax_f16_f16_rank3_reduce1_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 2) - get_device_softmax_f16_f16_rank3_reduce2_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 3) - get_device_softmax_f16_f16_rank3_reduce3_generic_instance(op_ptr); - } - else if constexpr(Rank == 4) - { - if constexpr(NumReduceDim == 1) - get_device_softmax_f16_f16_rank4_reduce1_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 2) - get_device_softmax_f16_f16_rank4_reduce2_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 3) - get_device_softmax_f16_f16_rank4_reduce3_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 4) - get_device_softmax_f16_f16_rank4_reduce4_generic_instance(op_ptr); - } - } - else if constexpr(std::is_same_v && std::is_same_v && - std::is_same_v) - { - if constexpr(Rank == 3) - { - if constexpr(NumReduceDim == 1) - get_device_softmax_f32_f32_rank3_reduce1_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 2) - get_device_softmax_f32_f32_rank3_reduce2_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 3) - get_device_softmax_f32_f32_rank3_reduce3_generic_instance(op_ptr); - } - else if 
constexpr(Rank == 4) - { - if constexpr(NumReduceDim == 1) - get_device_softmax_f32_f32_rank4_reduce1_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 2) - get_device_softmax_f32_f32_rank4_reduce2_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 3) - get_device_softmax_f32_f32_rank4_reduce3_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 4) - get_device_softmax_f32_f32_rank4_reduce4_generic_instance(op_ptr); - } - } - else if constexpr(std::is_same_v && std::is_same_v && - std::is_same_v) - { - if constexpr(Rank == 3) - { - if constexpr(NumReduceDim == 1) - get_device_softmax_i8_i8_rank3_reduce1_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 2) - get_device_softmax_i8_i8_rank3_reduce2_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 3) - get_device_softmax_i8_i8_rank3_reduce3_generic_instance(op_ptr); - } - else if constexpr(Rank == 4) - { - if constexpr(NumReduceDim == 1) - get_device_softmax_i8_i8_rank4_reduce1_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 2) - get_device_softmax_i8_i8_rank4_reduce2_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 3) - get_device_softmax_i8_i8_rank4_reduce3_generic_instance(op_ptr); - else if constexpr(NumReduceDim == 4) - get_device_softmax_i8_i8_rank4_reduce4_generic_instance(op_ptr); - } - } - - return op_ptr; - }; - static auto GetInstances() { std::vector> op_ptrs; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp index a5bedf47d9f..3fd2bd089ed 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp @@ -16,9 +16,6 @@ namespace instance { void 
add_device_softmax_f16_f16_rank3_reduce1_instances( std::vector>& instances); -void get_device_softmax_f16_f16_rank3_reduce1_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp index 5d105706c58..210fdc0a585 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f16_f16_rank3_reduce2_instances( std::vector>& instances); -void get_device_softmax_f16_f16_rank3_reduce2_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp index 20b6281b40d..894fb034d0d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f16_f16_rank3_reduce3_instances( std::vector>& instances); -void get_device_softmax_f16_f16_rank3_reduce3_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp index ffc40eefd20..708ef0ce130 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce1_instances( std::vector>& instances); -void get_device_softmax_f16_f16_rank4_reduce1_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp index e435c5ffb80..6754e5ceffa 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce2_instances( std::vector>& instances); -void get_device_softmax_f16_f16_rank4_reduce2_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp index 89ace5fa165..5e111176e19 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce3_instances( std::vector>& instances); -void get_device_softmax_f16_f16_rank4_reduce3_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp index 5007b76c074..a3cecb32f83 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce4_instances( std::vector>& instances); -void get_device_softmax_f16_f16_rank4_reduce4_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp index 8f1e7491e02..4cc46902533 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f32_f32_rank3_reduce1_instances( std::vector>& 
instances); -void get_device_softmax_f32_f32_rank3_reduce1_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp index a51593a6b9f..65724d7888a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f32_f32_rank3_reduce2_instances( std::vector>& instances); -void get_device_softmax_f32_f32_rank3_reduce2_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp index 7f68f9c7d12..13bd45598ec 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f32_f32_rank3_reduce3_instances( std::vector>& instances); -void get_device_softmax_f32_f32_rank3_reduce3_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp 
b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp index a63608f2c49..d58b424ee94 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce1_instances( std::vector>& instances); -void get_device_softmax_f32_f32_rank4_reduce1_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp index a93993c66a4..378e45eeb78 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce2_instances( std::vector>& instances); -void get_device_softmax_f32_f32_rank4_reduce2_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp index 0a2cb69485f..293df08c7e9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce3_instances( std::vector>& instances); -void get_device_softmax_f32_f32_rank4_reduce3_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp index 58d0fe112f1..e503a9fec1f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce4_instances( std::vector>& instances); -void get_device_softmax_f32_f32_rank4_reduce4_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp index e34db34b470..e047bf606ab 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_i8_i8_rank3_reduce1_instances( std::vector>& instances); -void get_device_softmax_i8_i8_rank3_reduce1_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // 
namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp index f6c666b9c7b..6945a535ee2 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_i8_i8_rank3_reduce2_instances( std::vector>& instances); -void get_device_softmax_i8_i8_rank3_reduce2_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp index 5de51548dcb..54ef4932e4f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_i8_i8_rank3_reduce3_instances( std::vector>& instances); -void get_device_softmax_i8_i8_rank3_reduce3_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp index 767991073c1..577485f21da 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce1_instances( std::vector>& instances); -void get_device_softmax_i8_i8_rank4_reduce1_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp index 4192fe6cd80..3db80207e38 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce2_instances( std::vector>& instances); -void get_device_softmax_i8_i8_rank4_reduce2_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp index 555945856aa..d076beda3d9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce3_instances( std::vector>& instances); -void 
get_device_softmax_i8_i8_rank4_reduce3_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp index e5f87c73128..19b913d859a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp @@ -16,9 +16,6 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce4_instances( std::vector>& instances); -void get_device_softmax_i8_i8_rank4_reduce4_generic_instance( - DeviceSoftmaxPtr& instance); - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp index 69d965fed53..2f77da9efe0 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank3_reduce1_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{}); } -void get_device_softmax_f16_f16_rank3_reduce1_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 1>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp index e09ec97a28f..b1c8c126b55 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank3_reduce2_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{}); } -void get_device_softmax_f16_f16_rank3_reduce2_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 2>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp index 0b13d934af7..898375567e2 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank3_reduce3_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{}); } -void get_device_softmax_f16_f16_rank3_reduce3_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 3>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp index c50c11d6574..2ea196577cf 100644 --- 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank4_reduce1_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{}); } -void get_device_softmax_f16_f16_rank4_reduce1_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 1>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp index b5ef88ea5b3..d373f918b81 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank4_reduce2_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{}); } -void get_device_softmax_f16_f16_rank4_reduce2_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 2>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp index 2f386e87268..07fabead64d 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp +++ 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank4_reduce3_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{}); } -void get_device_softmax_f16_f16_rank4_reduce3_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 3>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp index 5c7522505b5..36de53bd7c6 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank4_reduce4_instances( add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{}); } -void get_device_softmax_f16_f16_rank4_reduce4_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 4>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp index 21b886f075e..4ebcfb7075d 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank3_reduce1_instances( 
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{}); } -void get_device_softmax_f32_f32_rank3_reduce1_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 1>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp index bc5f44a6e0e..0c61d81c5b0 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank3_reduce2_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{}); } -void get_device_softmax_f32_f32_rank3_reduce2_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 2>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp index d785c9f88c5..7670ecf80db 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank3_reduce3_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{}); } -void get_device_softmax_f32_f32_rank3_reduce3_generic_instance( - DeviceSoftmaxPtr& instance) -{ - 
get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 3>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp index 83b2e1437ea..fcdc9627194 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank4_reduce1_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{}); } -void get_device_softmax_f32_f32_rank4_reduce1_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 1>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp index 9c3a2401feb..c2faac889e9 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank4_reduce2_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{}); } -void get_device_softmax_f32_f32_rank4_reduce2_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 2>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp index 8067d0d9ef1..ba11fb09084 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank4_reduce3_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{}); } -void get_device_softmax_f32_f32_rank4_reduce3_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 3>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp index a4b9f0b2e97..b608ac449ae 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp @@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank4_reduce4_instances( add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{}); } -void get_device_softmax_f32_f32_rank4_reduce4_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 4>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp 
index 6118a252361..fe578366101 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp @@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank3_reduce1_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{}); } -void get_device_softmax_i8_i8_rank3_reduce1_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 1>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp index 3d630984724..c3f6b2f823c 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp @@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank3_reduce2_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{}); } -void get_device_softmax_i8_i8_rank3_reduce2_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 2>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp index 1e92994dffd..4b372626e5d 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp +++ 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp @@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank3_reduce3_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{}); } -void get_device_softmax_i8_i8_rank3_reduce3_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 3>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp index a0f5c1549e0..876bb5af874 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp @@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank4_reduce1_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{}); } -void get_device_softmax_i8_i8_rank4_reduce1_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 1>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp index c8e4c101197..1539d8a55e7 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp @@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank4_reduce2_instances( add_device_operation_instances(instances, 
device_softmax_i8_i8_instances<4, 2>{}); } -void get_device_softmax_i8_i8_rank4_reduce2_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 2>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp index 32e18c30644..1d59752b59e 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp @@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank4_reduce3_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{}); } -void get_device_softmax_i8_i8_rank4_reduce3_generic_instance( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 3>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp index 654878abd39..aecdfe542e4 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp @@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank4_reduce4_instances( add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{}); } -void add_device_softmax_i8_i8_rank4_reduce4_instances( - DeviceSoftmaxPtr& instance) -{ - get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 4>{}); -} - } // 
namespace instance } // namespace device } // namespace tensor_operation From 43eb43b05e62f2a5e606f5bbe5ae3571d9787dbb Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Tue, 6 Jun 2023 20:18:10 +0000 Subject: [PATCH 07/14] Support generic kernel instance to be the first instance returned by GetInstances() for GroupNorm --- .../18_groupnorm/groupnorm_swish.cpp | 24 +++++++++++++++++++ .../device_groupnorm_f16_instance.cpp | 1 + .../device_groupnorm_f32_instance.cpp | 1 + ...oupnorm_swish_f16_f32_f32_f16_instance.cpp | 2 ++ .../device_groupnorm_swish_f16_instance.cpp | 1 + .../device_groupnorm_swish_f32_instance.cpp | 1 + .../device_layernorm2d_f16_instance.cpp | 1 + .../device_layernorm2d_f32_instance.cpp | 1 + .../device_layernorm4d_f16_instance.cpp | 1 + .../device_layernorm4d_f32_instance.cpp | 1 + .../normalization_instance_common.hpp | 21 ++++++++++++++++ 11 files changed, 55 insertions(+) diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp index 308061a3249..e1d198d2282 100644 --- a/client_example/18_groupnorm/groupnorm_swish.cpp +++ b/client_example/18_groupnorm/groupnorm_swish.cpp @@ -72,6 +72,30 @@ int main(int argc, char* argv[]) std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + const auto& generic_op_ptr = op_ptrs[0]; + + auto generic_argument_ptr = + generic_op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths + xy_strides, // xStrides + gamma_beta_strides, // gammaStrides + gamma_beta_strides, // betaStrides + xy_strides, // yStrides + {1, 2, 4}, // reduceDims + 1e-6, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + Swish{}); + + if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get())) + { + throw std::runtime_error( + "The generic kernel instance should be able to support any input shapes"); + }; + std::string best_op_name; bool found = 
false; int best_op_id = -1; diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp index be860f58e06..775fabaf081 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp @@ -14,6 +14,7 @@ void add_device_normalization_rank_5_3_f16_instances( std::vector>>& instances) { + add_device_operation_instances(instances, device_normalization_f16_generic_instance{}); add_device_operation_instances(instances, device_normalization_f16_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp index 9a64e555d65..a76f085ae4b 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp @@ -14,6 +14,7 @@ void add_device_normalization_rank_5_3_f32_instances( std::vector>>& instances) { + add_device_operation_instances(instances, device_normalization_f32_generic_instance{}); add_device_operation_instances(instances, device_normalization_f32_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp index fe72a27331d..00b5101fdf3 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp @@ -14,6 +14,8 @@ void 
add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances( std::vector>>& instances) { + add_device_operation_instances(instances, + device_normalization_f16_f32_f32_f16_generic_instance{}); add_device_operation_instances(instances, device_normalization_f16_f32_f32_f16_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp index cac8641e135..736f8b304ce 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp @@ -14,6 +14,7 @@ void add_device_normalization_rank_5_3_swish_f16_instances( std::vector>>& instances) { + add_device_operation_instances(instances, device_normalization_f16_generic_instance{}); add_device_operation_instances(instances, device_normalization_f16_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp index 0a9ac846235..567305598e2 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp @@ -14,6 +14,7 @@ void add_device_normalization_rank_5_3_swish_f32_instances( std::vector>>& instances) { + add_device_operation_instances(instances, device_normalization_f32_generic_instance{}); add_device_operation_instances(instances, device_normalization_f32_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp index ad92818ec2f..ff2a13695fd 100644 --- 
a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp @@ -14,6 +14,7 @@ void add_device_normalization_rank_2_1_f16_instances( std::vector>>& instances) { + add_device_operation_instances(instances, device_normalization_f16_generic_instance{}); add_device_operation_instances(instances, device_normalization_f16_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp index 70e3bbc1c1d..62a8fa87da7 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp @@ -14,6 +14,7 @@ void add_device_normalization_rank_2_1_f32_instances( std::vector>>& instances) { + add_device_operation_instances(instances, device_normalization_f32_generic_instance{}); add_device_operation_instances(instances, device_normalization_f32_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp index 7c5d2c4a9c1..16d0f1f098e 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp @@ -14,6 +14,7 @@ void add_device_normalization_rank_4_3_f16_instances( std::vector>>& instances) { + add_device_operation_instances(instances, device_normalization_f16_generic_instance{}); add_device_operation_instances(instances, device_normalization_f16_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp 
b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp index f5626d4a9a3..049fcd7769c 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp @@ -14,6 +14,7 @@ void add_device_normalization_rank_4_3_f32_instances( std::vector>>& instances) { + add_device_operation_instances(instances, device_normalization_f32_generic_instance{}); add_device_operation_instances(instances, device_normalization_f32_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp index d9029ac25e8..b0684962f9e 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp +++ b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp @@ -43,6 +43,13 @@ using device_normalization_f16_instances = // clang-format on >; +template +using device_normalization_f16_generic_instance = std::tuple< + // clang-format off + DeviceNormalizationImpl + // clang-format on + >; + template using device_normalization_f32_instances = std::tuple< // clang-format off @@ -69,6 +76,13 @@ using device_normalization_f32_instances = std::tuple< // clang-format on >; +template +using device_normalization_f32_generic_instance = std::tuple< + // clang-format off + DeviceNormalizationImpl + // clang-format on + >; + template using device_normalization_f16_f32_f32_f16_instances = std::tuple< // clang-format off @@ -95,6 +109,13 @@ using device_normalization_f16_f32_f32_f16_instances = std::tuple< // clang-format on >; +template +using device_normalization_f16_f32_f32_f16_generic_instance = std::tuple< + // clang-format off + DeviceNormalizationImpl + // clang-format on + >; + } // namespace instance } // namespace device 
} // namespace tensor_operation From ab11f5490a4255d185627c4d00b4d92d2d41e0c5 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Wed, 7 Jun 2023 14:36:19 +0000 Subject: [PATCH 08/14] Move generic kernel instance to separate tuple for elementwise op of normalization --- .../gpu/elementwise/device_normalize_instance.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp index 706d4bac317..f2a5f0728ac 100644 --- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp @@ -28,17 +28,25 @@ using Normalize = ck::tensor_operation::element_wise::Normalize; using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple< // clang-format off //###################|| | functor| NDim| MPerThread| | | - DeviceElementwiseImpl, Tuple, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, DeviceElementwiseImpl, Tuple, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >, DeviceElementwiseImpl, Tuple, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >, DeviceElementwiseImpl, Tuple, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> > // clang-format on >; +using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance = std::tuple< + // clang-format off + DeviceElementwiseImpl, Tuple, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> > + // clang-format on + >; + void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances( std::vector, Tuple, Normalize, 2>>& instances) { + add_device_operation_instances( + instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance{}); + add_device_operation_instances( instances, 
device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{}); } From 2b721d83de32fd10b42fa9ad366ef046e0c445ad Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Wed, 7 Jun 2023 14:39:31 +0000 Subject: [PATCH 09/14] Remove un-used files for softmax instance --- .../device_softmax_f16_f16_instance.hpp | 22 ---------- .../device_softmax_f32_f32_instance.hpp | 22 ---------- .../softmax/device_softmax_i8_i8_instance.hpp | 22 ---------- .../device_softmax_f16_f16_instance.cpp | 40 ------------------- .../device_softmax_f32_f32_instance.cpp | 40 ------------------- .../softmax/device_softmax_i8_i8_instance.cpp | 40 ------------------- 6 files changed, 186 deletions(-) delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp deleted file mode 100644 index 7c6f189cb99..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_f16_f16_rank3_instances( - std::vector>& instances); -void add_device_softmax_f16_f16_rank4_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp deleted file mode 100644 index 41c67af7ade..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_f32_f32_rank3_instances( - std::vector>& instances); -void add_device_softmax_f32_f32_rank4_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp deleted file mode 100644 index 3cd3742093f..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank3_instances( - std::vector>& instances); -void add_device_softmax_i8_i8_rank4_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp deleted file mode 100644 index a86da7cc795..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp" - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_f16_f16_rank3_instances( - std::vector>& instances) -{ - 
add_device_softmax_f16_f16_rank3_reduce1_instances(instances); - add_device_softmax_f16_f16_rank3_reduce2_instances(instances); - add_device_softmax_f16_f16_rank3_reduce3_instances(instances); -} - -void add_device_softmax_f16_f16_rank4_instances( - std::vector>& instances) -{ - add_device_softmax_f16_f16_rank4_reduce1_instances(instances); - add_device_softmax_f16_f16_rank4_reduce2_instances(instances); - add_device_softmax_f16_f16_rank4_reduce3_instances(instances); - add_device_softmax_f16_f16_rank4_reduce4_instances(instances); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp deleted file mode 100644 index ab8a69eec21..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp" - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_f32_f32_rank3_instances( - std::vector>& instances) -{ - add_device_softmax_f32_f32_rank3_reduce1_instances(instances); - add_device_softmax_f32_f32_rank3_reduce2_instances(instances); - add_device_softmax_f32_f32_rank3_reduce3_instances(instances); -} - -void add_device_softmax_f32_f32_rank4_instances( - std::vector>& instances) -{ - add_device_softmax_f32_f32_rank4_reduce1_instances(instances); - add_device_softmax_f32_f32_rank4_reduce2_instances(instances); - add_device_softmax_f32_f32_rank4_reduce3_instances(instances); - add_device_softmax_f32_f32_rank4_reduce4_instances(instances); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp deleted file mode 100644 index 81a2ff80ca6..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: 
MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp" - -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank3_instances( - std::vector>& instances) -{ - add_device_softmax_i8_i8_rank3_reduce1_instances(instances); - add_device_softmax_i8_i8_rank3_reduce2_instances(instances); - add_device_softmax_i8_i8_rank3_reduce3_instances(instances); -} - -void add_device_softmax_i8_i8_rank4_instances( - std::vector>& instances) -{ - add_device_softmax_i8_i8_rank4_reduce1_instances(instances); - add_device_softmax_i8_i8_rank4_reduce2_instances(instances); - add_device_softmax_i8_i8_rank4_reduce3_instances(instances); - add_device_softmax_i8_i8_rank4_reduce4_instances(instances); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck From 53a032061c634ab3226871074f7dbaef4b114425 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Wed, 7 Jun 2023 15:24:06 +0000 Subject: [PATCH 10/14] Store generic kernel instance to separate tuple for softmax --- .../gpu/softmax/device_softmax_f16_f16_instance_type.hpp | 8 +++++++- 
.../gpu/softmax/device_softmax_f32_f32_instance_type.hpp | 9 ++++++++- .../gpu/softmax/device_softmax_i8_i8_instance_type.hpp | 8 +++++++- .../device_softmax_f16_f16_instance_rank3_reduce1.cpp | 1 + .../device_softmax_f16_f16_instance_rank3_reduce2.cpp | 1 + .../device_softmax_f16_f16_instance_rank3_reduce3.cpp | 1 + .../device_softmax_f16_f16_instance_rank4_reduce1.cpp | 1 + .../device_softmax_f16_f16_instance_rank4_reduce2.cpp | 1 + .../device_softmax_f16_f16_instance_rank4_reduce3.cpp | 1 + .../device_softmax_f16_f16_instance_rank4_reduce4.cpp | 1 + .../device_softmax_f32_f32_instance_rank3_reduce1.cpp | 1 + .../device_softmax_f32_f32_instance_rank3_reduce2.cpp | 1 + .../device_softmax_f32_f32_instance_rank3_reduce3.cpp | 1 + .../device_softmax_f32_f32_instance_rank4_reduce1.cpp | 1 + .../device_softmax_f32_f32_instance_rank4_reduce2.cpp | 1 + .../device_softmax_f32_f32_instance_rank4_reduce3.cpp | 1 + .../device_softmax_f32_f32_instance_rank4_reduce4.cpp | 1 + .../device_softmax_i8_i8_instance_rank3_reduce1.cpp | 1 + .../device_softmax_i8_i8_instance_rank3_reduce2.cpp | 1 + .../device_softmax_i8_i8_instance_rank3_reduce3.cpp | 1 + .../device_softmax_i8_i8_instance_rank4_reduce1.cpp | 1 + .../device_softmax_i8_i8_instance_rank4_reduce2.cpp | 1 + .../device_softmax_i8_i8_instance_rank4_reduce3.cpp | 1 + .../device_softmax_i8_i8_instance_rank4_reduce4.cpp | 1 + 24 files changed, 43 insertions(+), 3 deletions(-) diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp index 53c142f6120..8c0782daa55 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp @@ -16,7 +16,6 @@ template using device_softmax_f16_f16_instances = 
std::tuple< // clang-format off // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - // fallback kernel DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>, DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>, @@ -33,6 +32,13 @@ using device_softmax_f16_f16_instances = std::tuple< // clang-format on >; +template +using device_softmax_f16_f16_generic_instance = std::tuple< + // clang-format off + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 64, 8, 8, 1, 1, 1, 1, 1> + // clang-format on + >; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp index a034e41a072..90c5ddc8a01 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp @@ -16,7 +16,7 @@ template using device_softmax_f32_f32_instances = std::tuple< // clang-format off // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, 
Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>, DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>, DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>, @@ -32,6 +32,13 @@ using device_softmax_f32_f32_instances = std::tuple< // clang-format on >; +template +using device_softmax_f32_f32_generic_instance = std::tuple< + // clang-format off + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 64, 8, 8, 1, 1, 1, 1, 1> + // clang-format on + >; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp index 6ff07de2360..aa4bf6be3b1 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp @@ -17,7 +17,6 @@ template using device_softmax_i8_i8_instances = std::tuple< // clang-format off // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - // fallback kernel DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 1, 1>, DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 16, 16>, DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 16, 1, 16, 16>, @@ -34,6 +33,13 @@ using device_softmax_i8_i8_instances = std::tuple< // clang-format on >; +template +using 
device_softmax_i8_i8_generic_instance = std::tuple< + // clang-format off + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 64, 8, 8, 1, 1, 1, 1, 1> + // clang-format on + >; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp index 2f77da9efe0..36867d993f9 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f16_f16_rank3_reduce1_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 1>{}); add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp index b1c8c126b55..373f33ad597 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f16_f16_rank3_reduce2_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 2>{}); add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp index 898375567e2..d26b92b4f49 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f16_f16_rank3_reduce3_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 3>{}); add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp index 2ea196577cf..bbb735b6fe5 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce1_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 1>{}); add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp index d373f918b81..92dbe677603 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce2_instances( std::vector>& 
instances) { + add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 2>{}); add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp index 07fabead64d..354cda85d75 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce3_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 3>{}); add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp index 36de53bd7c6..edb5e42c103 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f16_f16_rank4_reduce4_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 4>{}); add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp index 4ebcfb7075d..566be8fc22c 100644 --- 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f32_f32_rank3_reduce1_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 1>{}); add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp index 0c61d81c5b0..f9c76e3116c 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f32_f32_rank3_reduce2_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 2>{}); add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp index 7670ecf80db..541e0d71a93 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f32_f32_rank3_reduce3_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 3>{}); 
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp index fcdc9627194..95a38df2834 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce1_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 1>{}); add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp index c2faac889e9..a29b88891d4 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce2_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 2>{}); add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp index ba11fb09084..0da46ea1b47 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp +++ 
b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce3_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 3>{}); add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp index b608ac449ae..fa217dc3f5b 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_f32_f32_rank4_reduce4_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 4>{}); add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp index fe578366101..1754d771faf 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_i8_i8_rank3_reduce1_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 1>{}); add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{}); } diff --git 
a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp index c3f6b2f823c..f77d66d6e63 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_i8_i8_rank3_reduce2_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 2>{}); add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp index 4b372626e5d..949d76ac68a 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_i8_i8_rank3_reduce3_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 3>{}); add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp index 876bb5af874..43c2979854e 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp @@ -16,6 +16,7 @@ namespace instance { void 
add_device_softmax_i8_i8_rank4_reduce1_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 1>{}); add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp index 1539d8a55e7..08ff41a7565 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce2_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 2>{}); add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 2>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp index 1d59752b59e..79850251636 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce3_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 3>{}); add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{}); } diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp index 
aecdfe542e4..77b120c7390 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp @@ -16,6 +16,7 @@ namespace instance { void add_device_softmax_i8_i8_rank4_reduce4_instances( std::vector>& instances) { + add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 4>{}); add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{}); } From 0fbff90826a7714c15d39c3ffb686b1ca69d588a Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Wed, 7 Jun 2023 15:38:17 +0000 Subject: [PATCH 11/14] Add IsSupported checking for generic instance to client example of softmax --- client_example/06_softmax/softmax4d.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp index 21c226c1ab7..2ccad27a887 100644 --- a/client_example/06_softmax/softmax4d.cpp +++ b/client_example/06_softmax/softmax4d.cpp @@ -64,6 +64,24 @@ int main(int argc, char* argv[]) const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< DeviceOp>::GetInstances(); + auto& generic_op_ptr = op_ptrs[0]; + + auto generic_argument_ptr = generic_op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + reduce_dims, + alpha, + beta, + in.GetDeviceBuffer(), + out.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); + + if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get())) + { + throw std::runtime_error( + "The generic kernel instance should be able to support any input shapes"); + }; + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; std::string best_op_name; From 559b02bcb660d81490ed75b68d8e7a14fc55c387 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Wed, 7 Jun 2023 16:32:38 +0000 Subject: [PATCH 12/14] Replace the 
get_device_normalize_from_mean_meansquare_instances() by the DeviceOperationInstanceFactory class for elementwise-normalization --- .../gemm_add_add_layernorm_naive.cpp | 19 +++++---- .../gpu/device_elementwise_instance.hpp | 41 ++++++++++++------- 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp index 1129dfa6b4d..58c91f903bc 100644 --- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp +++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp @@ -172,18 +172,19 @@ int main() BLayout, CLayout>(); - const auto normalize_ptrs = - ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances< - CDataType, - ReduceDataType, - ReduceDataType, - GammaDataType, - BetaDataType, - LayerNormOutDataType>(); - std::cout << "found " << gemm_reduce_ptrs.size() << " gemm_reduceMean_reduceSquareMean instances" << std::endl; + using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, + ck::Tuple, + ck::tensor_operation::element_wise::Normalize, + 2>; + + const auto normalize_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + NormalizeDeviceOp>::GetInstances(); + std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl; auto f_matrix_space_size = diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp index 7e6267c87b4..b03693b00aa 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp @@ -5,11 +5,10 @@ #include #include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include 
"ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" namespace ck { namespace tensor_operation { @@ -29,20 +28,34 @@ template -auto get_device_normalize_from_mean_meansquare_instances() +struct DeviceOperationInstanceFactory, + ck::Tuple, + Normalize, + 2>> { - std::vector op_ptrs; + using DeviceOp = DeviceElementwise< + ck::Tuple, + ck::Tuple, + Normalize, + 2>; - if constexpr(is_same::value && is_same::value && - is_same::value && is_same::value && - is_same::value && is_same::value) + static auto GetInstances() { - ck::tensor_operation::device::instance:: - add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs); - } - - return op_ptrs; -} + std::vector> op_ptrs; + + if constexpr(is_same::value && is_same::value && + is_same::value && + is_same::value && + is_same::value && is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs); + } + + return op_ptrs; + }; +}; } // namespace instance } // namespace device From 20f7c630f7c64d5051a80068d8be086b9d82f633 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Thu, 8 Jun 2023 08:47:14 +0000 Subject: [PATCH 13/14] clang-format fix --- .../gpu/normalization/device_groupnorm_f16_instance.cpp | 3 ++- .../gpu/normalization/device_groupnorm_f32_instance.cpp | 3 ++- .../device_groupnorm_swish_f16_f32_f32_f16_instance.cpp | 4 ++-- .../gpu/normalization/device_groupnorm_swish_f16_instance.cpp | 3 ++- .../gpu/normalization/device_groupnorm_swish_f32_instance.cpp | 3 ++- .../gpu/normalization/device_layernorm2d_f16_instance.cpp | 3 ++- .../gpu/normalization/device_layernorm2d_f32_instance.cpp | 3 ++- 
.../gpu/normalization/device_layernorm4d_f16_instance.cpp | 3 ++- .../gpu/normalization/device_layernorm4d_f32_instance.cpp | 3 ++- 9 files changed, 18 insertions(+), 10 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp index 775fabaf081..e3820462cf8 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp @@ -14,7 +14,8 @@ void add_device_normalization_rank_5_3_f16_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_normalization_f16_generic_instance{}); + add_device_operation_instances(instances, + device_normalization_f16_generic_instance{}); add_device_operation_instances(instances, device_normalization_f16_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp index a76f085ae4b..d85817aad31 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp @@ -14,7 +14,8 @@ void add_device_normalization_rank_5_3_f32_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_normalization_f32_generic_instance{}); + add_device_operation_instances(instances, + device_normalization_f32_generic_instance{}); add_device_operation_instances(instances, device_normalization_f32_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp index 
00b5101fdf3..a81f776c0f3 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp @@ -14,8 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances( std::vector>>& instances) { - add_device_operation_instances(instances, - device_normalization_f16_f32_f32_f16_generic_instance{}); + add_device_operation_instances( + instances, device_normalization_f16_f32_f32_f16_generic_instance{}); add_device_operation_instances(instances, device_normalization_f16_f32_f32_f16_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp index 736f8b304ce..f4bb8bda814 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp @@ -14,7 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_normalization_f16_generic_instance{}); + add_device_operation_instances(instances, + device_normalization_f16_generic_instance{}); add_device_operation_instances(instances, device_normalization_f16_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp index 567305598e2..bbb9bd0fe8b 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp @@ -14,7 +14,8 @@ void 
add_device_normalization_rank_5_3_swish_f32_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_normalization_f32_generic_instance{}); + add_device_operation_instances(instances, + device_normalization_f32_generic_instance{}); add_device_operation_instances(instances, device_normalization_f32_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp index ff2a13695fd..3f7e4aff1a2 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp @@ -14,7 +14,8 @@ void add_device_normalization_rank_2_1_f16_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_normalization_f16_generic_instance{}); + add_device_operation_instances(instances, + device_normalization_f16_generic_instance{}); add_device_operation_instances(instances, device_normalization_f16_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp index 62a8fa87da7..1f0db3a0366 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp @@ -14,7 +14,8 @@ void add_device_normalization_rank_2_1_f32_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_normalization_f32_generic_instance{}); + add_device_operation_instances(instances, + device_normalization_f32_generic_instance{}); add_device_operation_instances(instances, device_normalization_f32_instances{}); } diff --git 
a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp index 16d0f1f098e..cb9d72e6142 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp @@ -14,7 +14,8 @@ void add_device_normalization_rank_4_3_f16_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_normalization_f16_generic_instance{}); + add_device_operation_instances(instances, + device_normalization_f16_generic_instance{}); add_device_operation_instances(instances, device_normalization_f16_instances{}); } diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp index 049fcd7769c..ed555b840da 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp @@ -14,7 +14,8 @@ void add_device_normalization_rank_4_3_f32_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_normalization_f32_generic_instance{}); + add_device_operation_instances(instances, + device_normalization_f32_generic_instance{}); add_device_operation_instances(instances, device_normalization_f32_instances{}); } From e0e2207dc7c97aec0f24290bdc98b8dfa74adc37 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Thu, 15 Jun 2023 10:00:43 +0000 Subject: [PATCH 14/14] Remove int8 from softmax instances --- .../tensor_operation_instance/gpu/softmax.hpp | 24 ---------- ...e_softmax_i8_i8_instance_rank3_reduce1.hpp | 22 --------- ...e_softmax_i8_i8_instance_rank3_reduce2.hpp | 22 --------- 
...e_softmax_i8_i8_instance_rank3_reduce3.hpp | 22 --------- ...e_softmax_i8_i8_instance_rank4_reduce1.hpp | 22 --------- ...e_softmax_i8_i8_instance_rank4_reduce2.hpp | 22 --------- ...e_softmax_i8_i8_instance_rank4_reduce3.hpp | 22 --------- ...e_softmax_i8_i8_instance_rank4_reduce4.hpp | 22 --------- .../device_softmax_i8_i8_instance_type.hpp | 46 ------------------- .../gpu/softmax/device_softmax_instance.hpp | 7 --- .../gpu/softmax/CMakeLists.txt | 7 --- ...e_softmax_i8_i8_instance_rank3_reduce1.cpp | 26 ----------- ...e_softmax_i8_i8_instance_rank3_reduce2.cpp | 26 ----------- ...e_softmax_i8_i8_instance_rank3_reduce3.cpp | 26 ----------- ...e_softmax_i8_i8_instance_rank4_reduce1.cpp | 26 ----------- ...e_softmax_i8_i8_instance_rank4_reduce2.cpp | 26 ----------- ...e_softmax_i8_i8_instance_rank4_reduce3.cpp | 26 ----------- ...e_softmax_i8_i8_instance_rank4_reduce4.cpp | 26 ----------- test/softmax/test_softmax_rank3.cpp | 4 +- test/softmax/test_softmax_rank4.cpp | 4 +- 20 files changed, 2 insertions(+), 426 deletions(-) delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp delete mode 100644 
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp index 3f82b5bfd86..26815f1447c 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp @@ -89,30 +89,6 @@ struct DeviceOperationInstanceFactory && std::is_same_v && - std::is_same_v) - { - if constexpr(Rank == 3) - { - if constexpr(NumReduceDim == 1) - add_device_softmax_i8_i8_rank3_reduce1_instances(op_ptrs); - else if constexpr(NumReduceDim == 2) - add_device_softmax_i8_i8_rank3_reduce2_instances(op_ptrs); - else if constexpr(NumReduceDim == 3) - add_device_softmax_i8_i8_rank3_reduce3_instances(op_ptrs); - } - else if constexpr(Rank == 4) - { - if constexpr(NumReduceDim == 1) - add_device_softmax_i8_i8_rank4_reduce1_instances(op_ptrs); - else if constexpr(NumReduceDim == 2) - add_device_softmax_i8_i8_rank4_reduce2_instances(op_ptrs); - else if constexpr(NumReduceDim == 3) - 
add_device_softmax_i8_i8_rank4_reduce3_instances(op_ptrs); - else if constexpr(NumReduceDim == 4) - add_device_softmax_i8_i8_rank4_reduce4_instances(op_ptrs); - } - } return op_ptrs; } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp deleted file mode 100644 index e047bf606ab..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank3_reduce1_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp deleted file mode 100644 index 6945a535ee2..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank3_reduce2_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp deleted file mode 100644 index 54ef4932e4f..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank3_reduce3_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp deleted file mode 100644 index 577485f21da..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank4_reduce1_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp deleted file mode 100644 index 3db80207e38..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank4_reduce2_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp deleted file mode 100644 index d076beda3d9..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank4_reduce3_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp deleted file mode 100644 index 19b913d859a..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank4_reduce4_instances( - std::vector>& instances); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp deleted file mode 100644 index aa4bf6be3b1..00000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. 
All rights reserved. - -#include - -#include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" -#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -template -using device_softmax_i8_i8_instances = std::tuple< - // clang-format off - // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 1, 1>, - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 16, 16>, - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 16, 1, 16, 16>, - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 16, 16>, - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 16, 16>, - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 64, 1, 16, 16>, - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 16, 16>, - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 16, 16>, - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 64, 1, 16, 16>, - // Reduction on middle dimensions - // InSrcVectorDim is 0 since we want to coalesce reads on M dimension - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 8, 0, 1, 1>, - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 32, 8, 32, 8, 0, 16, 8> - // clang-format on - >; - 
-template -using device_softmax_i8_i8_generic_instance = std::tuple< - // clang-format off - DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 64, 8, 8, 1, 1, 1, 1, 1> - // clang-format on - >; - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp index fbd0437a2bc..10f99acb8d6 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp @@ -17,10 +17,3 @@ #include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp" #include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp" #include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp" diff --git a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt 
b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt index 2a96a8570dc..202ad12b972 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt @@ -1,11 +1,4 @@ add_instance_library(device_softmax_instance - device_softmax_i8_i8_instance_rank3_reduce1.cpp - device_softmax_i8_i8_instance_rank3_reduce2.cpp - device_softmax_i8_i8_instance_rank3_reduce3.cpp - device_softmax_i8_i8_instance_rank4_reduce1.cpp - device_softmax_i8_i8_instance_rank4_reduce2.cpp - device_softmax_i8_i8_instance_rank4_reduce3.cpp - device_softmax_i8_i8_instance_rank4_reduce4.cpp device_softmax_f16_f16_instance_rank3_reduce1.cpp device_softmax_f16_f16_instance_rank3_reduce2.cpp device_softmax_f16_f16_instance_rank3_reduce3.cpp diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp deleted file mode 100644 index 1754d771faf..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank3_reduce1_instances( - std::vector>& instances) -{ - add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 1>{}); - add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp deleted file mode 100644 index f77d66d6e63..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank3_reduce2_instances( - std::vector>& instances) -{ - add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 2>{}); - add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp deleted file mode 100644 index 949d76ac68a..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank3_reduce3_instances( - std::vector>& instances) -{ - add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 3>{}); - add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp deleted file mode 100644 index 43c2979854e..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank4_reduce1_instances( - std::vector>& instances) -{ - add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 1>{}); - add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp deleted file mode 100644 index 08ff41a7565..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank4_reduce2_instances( - std::vector>& instances) -{ - add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 2>{}); - add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 2>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp deleted file mode 100644 index 79850251636..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank4_reduce3_instances( - std::vector>& instances) -{ - add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 3>{}); - add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp deleted file mode 100644 index 77b120c7390..00000000000 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
- -#include - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp" -#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -void add_device_softmax_i8_i8_rank4_reduce4_instances( - std::vector>& instances) -{ - add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 4>{}); - add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/test/softmax/test_softmax_rank3.cpp b/test/softmax/test_softmax_rank3.cpp index 24ad912d8d7..43ae11bf1f2 100644 --- a/test/softmax/test_softmax_rank3.cpp +++ b/test/softmax/test_softmax_rank3.cpp @@ -13,7 +13,6 @@ using I = ck::Number; using F16 = ck::half_t; using F32 = float; -using I8 = int8_t; template class TestSoftmax : public ck::TestSoftmax @@ -24,8 +23,7 @@ class TestSoftmax : public ck::TestSoftmax using KernelTypes = ::testing::Types< // InDataType, AccDataType, OutDataType, Rank std::tuple< F16, F32, F16, I<3>>, - std::tuple< F32, F32, F32, I<3>>, - std::tuple< I8, F32, I8, I<3>> + std::tuple< F32, F32, F32, I<3>> >; // clang-format on diff --git a/test/softmax/test_softmax_rank4.cpp b/test/softmax/test_softmax_rank4.cpp index b58301fb112..5cf96bbaa85 100644 --- a/test/softmax/test_softmax_rank4.cpp +++ b/test/softmax/test_softmax_rank4.cpp @@ -13,7 +13,6 @@ using I = ck::Number; using F16 = ck::half_t; using F32 = float; -using I8 = int8_t; template class TestSoftmax : public ck::TestSoftmax @@ -24,8 +23,7 @@ class TestSoftmax : public ck::TestSoftmax using KernelTypes = ::testing::Types< // InDataType, 
AccDataType, OutDataType, Rank std::tuple< F16, F32, F16, I<4>>, - std::tuple< F32, F32, F32, I<4>>, - std::tuple< I8, F32, I8, I<4>> + std::tuple< F32, F32, F32, I<4>> >; // clang-format on