From 240900b1c086f480e99f8cc2deab62c20709a2ff Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 30 May 2023 21:17:54 +0000
Subject: [PATCH 01/14] Add NumReduceDim template parameter to DeviceSoftmax and Softmax client API to simplify instance collection

---
 client_example/06_softmax/softmax4d.cpp       |  14 +-
 .../gpu/device/device_softmax.hpp             |  17 +-
 .../gpu/device/impl/device_softmax_impl.hpp   |  17 +-
 .../tensor_operation_instance/gpu/softmax.hpp | 101 ++++++---
 .../device_softmax_f16_f16_instance.hpp       |  22 --
 ...softmax_f16_f16_instance_rank3_reduce1.hpp |   2 +-
 ...softmax_f16_f16_instance_rank3_reduce2.hpp |   2 +-
 ...softmax_f16_f16_instance_rank3_reduce3.hpp |   2 +-
 ...softmax_f16_f16_instance_rank4_reduce1.hpp |   2 +-
 ...softmax_f16_f16_instance_rank4_reduce2.hpp |   2 +-
 ...softmax_f16_f16_instance_rank4_reduce3.hpp |   2 +-
 ...softmax_f16_f16_instance_rank4_reduce4.hpp |   2 +-
 .../device_softmax_f32_f32_instance.hpp       |  22 --
 ...softmax_f32_f32_instance_rank3_reduce1.hpp |   2 +-
 ...softmax_f32_f32_instance_rank3_reduce2.hpp |   2 +-
 ...softmax_f32_f32_instance_rank3_reduce3.hpp |   2 +-
 ...softmax_f32_f32_instance_rank4_reduce1.hpp |   2 +-
 ...softmax_f32_f32_instance_rank4_reduce2.hpp |   2 +-
 ...softmax_f32_f32_instance_rank4_reduce3.hpp |   2 +-
 ...softmax_f32_f32_instance_rank4_reduce4.hpp |   2 +-
 .../softmax/device_softmax_i8_i8_instance.hpp |  22 --
 ...e_softmax_i8_i8_instance_rank3_reduce1.hpp |   2 +-
 ...e_softmax_i8_i8_instance_rank3_reduce2.hpp |   2 +-
 ...e_softmax_i8_i8_instance_rank3_reduce3.hpp |   2 +-
 ...e_softmax_i8_i8_instance_rank4_reduce1.hpp |   2 +-
 ...e_softmax_i8_i8_instance_rank4_reduce2.hpp |   2 +-
 ...e_softmax_i8_i8_instance_rank4_reduce3.hpp |   2 +-
 ...e_softmax_i8_i8_instance_rank4_reduce4.hpp |   2 +-
 .../gpu/softmax/device_softmax_instance.hpp   |  24 ++-
 .../gpu/softmax/CMakeLists.txt                |   3 -
 .../device_softmax_f16_f16_instance.cpp       |  40 ----
 ...softmax_f16_f16_instance_rank3_reduce1.cpp |   6 +-
 ...softmax_f16_f16_instance_rank3_reduce2.cpp |   6 +-
 ...softmax_f16_f16_instance_rank3_reduce3.cpp |   6 +-
 ...softmax_f16_f16_instance_rank4_reduce1.cpp |   6 +-
 ...softmax_f16_f16_instance_rank4_reduce2.cpp |   6 +-
 ...softmax_f16_f16_instance_rank4_reduce3.cpp |   6 +-
 ...softmax_f16_f16_instance_rank4_reduce4.cpp |   6 +-
 .../device_softmax_f32_f32_instance.cpp       |  40 ----
 ...softmax_f32_f32_instance_rank3_reduce1.cpp |   6 +-
 ...softmax_f32_f32_instance_rank3_reduce2.cpp |   6 +-
 ...softmax_f32_f32_instance_rank3_reduce3.cpp |   6 +-
 ...softmax_f32_f32_instance_rank4_reduce1.cpp |   6 +-
 ...softmax_f32_f32_instance_rank4_reduce2.cpp |   6 +-
 ...softmax_f32_f32_instance_rank4_reduce3.cpp |   6 +-
 ...softmax_f32_f32_instance_rank4_reduce4.cpp |   6 +-
 .../softmax/device_softmax_i8_i8_instance.cpp |  40 ----
 ...e_softmax_i8_i8_instance_rank3_reduce1.cpp |   6 +-
 ...e_softmax_i8_i8_instance_rank3_reduce2.cpp |   6 +-
 ...e_softmax_i8_i8_instance_rank3_reduce3.cpp |   6 +-
 ...e_softmax_i8_i8_instance_rank4_reduce1.cpp |   6 +-
 ...e_softmax_i8_i8_instance_rank4_reduce2.cpp |   6 +-
 ...e_softmax_i8_i8_instance_rank4_reduce3.cpp |   6 +-
 ...e_softmax_i8_i8_instance_rank4_reduce4.cpp |   6 +-
 .../include/profiler/profile_softmax_impl.hpp |  30 ++-
 profiler/src/profile_softmax.cpp              | 191 ++++++++++++++----
 test/softmax/test_softmax_util.hpp            |  88 +++++++-
 57 files changed, 441 insertions(+), 398 deletions(-)
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp

diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp
index e939ce8dfed..aef5624cadc 100644
--- a/client_example/06_softmax/softmax4d.cpp
+++ b/client_example/06_softmax/softmax4d.cpp
@@ -53,8 +53,13 @@ int main(int argc, char* argv[])
     SimpleDeviceMem in(sizeof(InDataType) * num_elements);
     SimpleDeviceMem out(sizeof(OutDataType) * num_elements);
 
-    using DeviceOp = ck::tensor_operation::device::
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+    using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
+                                                                 AccDataType,
+                                                                 OutDataType,
+                                                                 PassThrough,
+                                                                 PassThrough,
+                                                                 Rank,
+                                                                 NumReduceDim>;
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp>::GetInstances();
@@ -74,11 +79,6 @@ int main(int argc, char* argv[])
     {
         auto& op_ptr = op_ptrs[i];
 
-        if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim)
-        {
-            continue;
-        }
-
         auto argument_ptr   = op_ptr->MakeArgumentPointer(in_lengths,
                                                         in_strides,
                                                         reduce_dims,
diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
index 94f788e5177..1ac746c3f46 100644
--- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
@@ -18,7 +18,8 @@ template <typename InDataType,
           typename OutDataType,
           typename InElementwiseOp,
           typename AccElementwiseOp,
-          index_t Rank>
+          index_t Rank,
+          index_t NumReduceDim>
 struct DeviceSoftmax : public BaseOperator
 {
     //
@@ -49,8 +50,6 @@ struct DeviceSoftmax : public BaseOperator
                         AccElementwiseOp acc_elementwise_op) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-    virtual index_t GetRank() const                           = 0;
-    virtual index_t GetNumReduceDim() const                   = 0;
 };
 
 template <typename InDataType,
@@ -58,9 +57,15 @@ template <typename InDataType,
           typename OutDataType,
           typename InElementwiseOp,
           typename AccElementwiseOp,
-          index_t Rank>
-using DeviceSoftmaxPtr = std::unique_ptr<
-    DeviceSoftmax<InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank>>;
+          index_t Rank,
+          index_t NumReduceDim>
+using DeviceSoftmaxPtr = std::unique_ptr<DeviceSoftmax<InDataType,
+                                                       AccDataType,
+                                                       OutDataType,
+                                                       InElementwiseOp,
+                                                       AccElementwiseOp,
+                                                       Rank,
+                                                       NumReduceDim>>;
 
 } // namespace device
 } // namespace tensor_operation
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
index ed96b7340cf..fd2577913e2 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
@@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
                                                 OutDataType,
                                                 InElementwiseOp,
                                                 AccElementwiseOp,
-                                                Rank>
+                                                Rank,
+                                                NumReduceDim>
 {
-    static constexpr index_t kRank            = Rank;
-    static constexpr index_t kNumReduceDim    = NumReduceDim;
-    static constexpr index_t kNumInvariantDim = Rank - NumReduceDim;
-
-    virtual index_t GetRank() const override { return kRank; }
-
-    virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }
-
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
     static constexpr index_t NumSrcDim = Rank;
@@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
     {
         if constexpr(InSrcVectorDim == 0)
         {
-            if constexpr(kNumInvariantDim == 0)
+            if constexpr(NumInvariantDim == 0)
             {
                 return false;
             }
             else
             {
-                if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
+                if(arg.inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
                 {
                     return false;
                 }
@@ -316,7 +309,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
         }
 
         // To improve
-        if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
+        if(NumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
         {
             return false;
         }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
index 36eb092f0f0..b4e816d822a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
@@ -9,34 +9,33 @@
 #include "ck/ck.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
 #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>&);
-
-template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
-struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>>
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
+          index_t NumReduceDim>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceSoftmax<InDataType,
+                                                                                  AccDataType,
+                                                                                  OutDataType,
+                                                                                  PassThrough,
+                                                                                  PassThrough,
+                                                                                  Rank,
+                                                                                  NumReduceDim>>
 {
-    using DeviceOp =
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+    using DeviceOp = DeviceSoftmax<InDataType,
+                                   AccDataType,
+                                   OutDataType,
+                                   PassThrough,
+                                   PassThrough,
+                                   Rank,
+                                   NumReduceDim>;
 
     static auto GetInstances()
     {
@@ -46,25 +45,73 @@ struct DeviceOperationInstanceFactory<
                      std::is_same_v<OutDataType, F16>)
         {
             if constexpr(Rank == 3)
-                add_device_softmax_f16_f16_rank3_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f16_f16_rank3_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f16_f16_rank3_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f16_f16_rank3_reduce3_instances(op_ptrs);
+            }
             else if constexpr(Rank == 4)
-                add_device_softmax_f16_f16_rank4_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f16_f16_rank4_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f16_f16_rank4_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f16_f16_rank4_reduce3_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 4)
+                    add_device_softmax_f16_f16_rank4_reduce4_instances(op_ptrs);
+            }
         }
         else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
                           std::is_same_v<OutDataType, F32>)
         {
             if constexpr(Rank == 3)
-                add_device_softmax_f32_f32_rank3_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f32_f32_rank3_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f32_f32_rank3_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f32_f32_rank3_reduce3_instances(op_ptrs);
+            }
             else if constexpr(Rank == 4)
-                add_device_softmax_f32_f32_rank4_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f32_f32_rank4_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f32_f32_rank4_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f32_f32_rank4_reduce3_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 4)
+                    add_device_softmax_f32_f32_rank4_reduce4_instances(op_ptrs);
+            }
         }
         else if constexpr(std::is_same_v<InDataType, I8> && std::is_same_v<AccDataType, F32> &&
                           std::is_same_v<OutDataType, I8>)
         {
             if constexpr(Rank == 3)
-                add_device_softmax_i8_i8_rank3_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_i8_i8_rank3_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_i8_i8_rank3_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_i8_i8_rank3_reduce3_instances(op_ptrs);
+            }
             else if constexpr(Rank == 4)
-                add_device_softmax_i8_i8_rank4_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_i8_i8_rank4_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_i8_i8_rank4_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_i8_i8_rank4_reduce3_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 4)
+                    add_device_softmax_i8_i8_rank4_reduce4_instances(op_ptrs);
+            }
         }
 
         return op_ptrs;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
deleted file mode 100644
index 83f52fc3ee7..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
index 046ff578055..868d3b72122 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
index 8e6a226f6a1..b6d422e7b8e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
index 518fa5f9867..88dce37b62e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
index 10016cdd707..9e2783a9bd7 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
index cdd5a3cd7b6..d7fc62ca9d4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
index a8be272e020..f5f1143fb90 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
index ec8296ff22f..85fbef53b76 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f16_f16_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
deleted file mode 100644
index a6d9a359f46..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
index 6621a2c867a..4cd3db2ab97 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
index 3dfac98ed8b..20cfbd43af2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
index 6d2a0c93250..e3ad524762e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
index 97dd3dcb18a..8a6f8b4206a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
index 58f8760accc..f3a4e6b9887 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
index df8d31f0da7..0721357f588 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
index 1bd773227e1..a479be7c6ea 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_f32_f32_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
deleted file mode 100644
index f80f712ff5e..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
index 6f9952e7d58..5d9cbcee2be 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_i8_i8_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
index 2cbd13a1ba5..e2fb5190704 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_i8_i8_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
index 7b12522a859..10f5fb8918c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_i8_i8_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
index 54d477f80c5..82127e99279 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_i8_i8_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
index 4ffc44e3a92..b1a5caf404a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_i8_i8_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
index 08cbb81272f..654d33ca772 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_i8_i8_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
index 187d034b95a..4db2c687a6c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {
 
 void add_device_softmax_i8_i8_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances);
 
 } // namespace instance
 } // namespace device
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
index 03be6e2bc7c..86233b9ce71 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
@@ -3,6 +3,24 @@
 
 #pragma once
 
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
index fc13261a6a7..2a96a8570dc 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_instance_library(device_softmax_instance
-    device_softmax_i8_i8_instance.cpp
     device_softmax_i8_i8_instance_rank3_reduce1.cpp
     device_softmax_i8_i8_instance_rank3_reduce2.cpp
     device_softmax_i8_i8_instance_rank3_reduce3.cpp
@@ -7,7 +6,6 @@ add_instance_library(device_softmax_instance
     device_softmax_i8_i8_instance_rank4_reduce2.cpp
     device_softmax_i8_i8_instance_rank4_reduce3.cpp
     device_softmax_i8_i8_instance_rank4_reduce4.cpp
-    device_softmax_f16_f16_instance.cpp
     device_softmax_f16_f16_instance_rank3_reduce1.cpp
     device_softmax_f16_f16_instance_rank3_reduce2.cpp
     device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -15,7 +13,6 @@ add_instance_library(device_softmax_instance
     device_softmax_f16_f16_instance_rank4_reduce2.cpp
     device_softmax_f16_f16_instance_rank4_reduce3.cpp
     device_softmax_f16_f16_instance_rank4_reduce4.cpp
-    device_softmax_f32_f32_instance.cpp
     device_softmax_f32_f32_instance_rank3_reduce1.cpp
     device_softmax_f32_f32_instance_rank3_reduce2.cpp
     device_softmax_f32_f32_instance_rank3_reduce3.cpp
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
deleted file mode 100644
index 14d2764529c..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
-    add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
-    add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_f16_f16_rank4_reduce1_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce2_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce3_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
index fa334b997c2..3c7c5cb1291 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f16_f16_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
index 1c9d37d8483..2ce22a97730 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f16_f16_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
index 5fbdab5055e..5ce03f02e2c 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f16_f16_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
index 7dd8640b187..c020aa341d1 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
index b32fe6838f8..0a3b0978a18 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
index c05048ec567..cfa0375c09d 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
index 6a235708bd4..679d6a6364d 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 4>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
deleted file mode 100644
index e5bec5e2639..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_f32_f32_rank3_reduce1_instances(instances);
-    add_device_softmax_f32_f32_rank3_reduce2_instances(instances);
-    add_device_softmax_f32_f32_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_f32_f32_rank4_reduce1_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce2_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce3_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
index 57d3f184a66..17dfbb54698 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f32_f32_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
index fae3a4dd666..03127397044 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f32_f32_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
index b6fb70e8e2a..cc9efe1c858 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f32_f32_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
index 33c7b6f35f3..a352082990d 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
index c22aa574b1f..ec1619a71d2 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
index 55f3d2bd207..a0cf3d08587 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
index fb0bcf5ee8a..e8b5bc87a5b 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 4>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
deleted file mode 100644
index 608cfcf8380..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_i8_i8_rank3_reduce1_instances(instances);
-    add_device_softmax_i8_i8_rank3_reduce2_instances(instances);
-    add_device_softmax_i8_i8_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_i8_i8_rank4_reduce1_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce2_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce3_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
index 15552dbae5d..944e0c3bacd 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_i8_i8_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
index 67674028860..24da3e58d23 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_i8_i8_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
index 4b33da93c2e..7febbf23534 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 3;
-
 void add_device_softmax_i8_i8_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
index fe3b823e889..08b56f01667 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_i8_i8_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
index 8ecdf87d9fe..7f0ebceb82b 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_i8_i8_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 2>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
index 35631352040..7145a8d91b0 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_i8_i8_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
index aa21a0bf8a8..6118a84ab6d 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-static constexpr index_t RANK = 4;
-
 void add_device_softmax_i8_i8_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 4>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{});
 }
 
 } // namespace instance
diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp
index 96816f53bbb..01305f264ef 100644
--- a/profiler/include/profiler/profile_softmax_impl.hpp
+++ b/profiler/include/profiler/profile_softmax_impl.hpp
@@ -40,7 +40,11 @@ template <> std::string type_to_string<int8_t>()  { return "int8"; }
 template <> std::string type_to_string<int32_t>() { return "int32"; }
 // clang-format on
 
-template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
+          index_t NumReduceDim>
 bool profile_softmax_impl(int do_verification,
                           int init_method,
                           bool do_log,
@@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification,
     if(Rank != in_length.size())
     {
         throw std::runtime_error("Input tensor rank is different from template argument Rank!");
     }
+
+    if(NumReduceDim != reduce_dims.size()) // must agree with the NumReduceDim used for DeviceOp
+    {
+        throw std::runtime_error(
+            "Input reduce_dims rank is different from template argument NumReduceDim!");
+    }
 
     Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
                                                : Tensor<InDataType>(in_length, in_strides);
@@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification,
 
     // add device softmax instances
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-    using DeviceOp    = tensor_operation::device::
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+    using DeviceOp    = tensor_operation::device::DeviceSoftmax<InDataType,
+                                                             AccDataType,
+                                                             OutDataType,
+                                                             PassThrough,
+                                                             PassThrough,
+                                                             Rank,
+                                                             NumReduceDim>;
 
     // get device op instances
     const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification,
 
     for(auto& inst_ptr : instances)
     {
-        // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
-        // problem to rank 4 kernel) other than invoking IsSupportedArgument()?
-        if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
-        {
-            continue;
-        }
-
         auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
                                                           in_tensor_strides,
                                                           reduce_dims,
diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp
index 78b64dda7d7..48a60a42c33 100644
--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
     {
         if(data_type == SoftmaxDataType::F16_F16)
         {
-            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
-                                                                                 init_method,
-                                                                                 do_log,
-                                                                                 time_kernel,
-                                                                                 length,
-                                                                                 stride,
-                                                                                 reduce,
-                                                                                 double(alpha),
-                                                                                 double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 1>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 2>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 3>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
         }
         else if(data_type == SoftmaxDataType::F32_F32)
         {
-            ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
-                                                                       init_method,
-                                                                       do_log,
-                                                                       time_kernel,
-                                                                       length,
-                                                                       stride,
-                                                                       reduce,
-                                                                       double(alpha),
-                                                                       double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 1>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 2>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 3>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
         }
         else
         {
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
     {
         if(data_type == SoftmaxDataType::F16_F16)
         {
-            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
-                                                                                 init_method,
-                                                                                 do_log,
-                                                                                 time_kernel,
-                                                                                 length,
-                                                                                 stride,
-                                                                                 reduce,
-                                                                                 double(alpha),
-                                                                                 double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 1>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 2>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 3>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 4)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 4>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
         }
         else if(data_type == SoftmaxDataType::F32_F32)
         {
-            ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
-                                                                       init_method,
-                                                                       do_log,
-                                                                       time_kernel,
-                                                                       length,
-                                                                       stride,
-                                                                       reduce,
-                                                                       double(alpha),
-                                                                       double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 1>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 2>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 3>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 4)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 4>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
         }
         else
         {
diff --git a/test/softmax/test_softmax_util.hpp b/test/softmax/test_softmax_util.hpp
index 40b300cf992..673b31f6c10 100644
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -61,8 +61,92 @@ class TestSoftmax : public ::testing::Test
         int init_method = 1; // integer value initialization
         bool log        = false;
         std::vector<ck::index_t> strides; // intenionally empty, to get packed layout.
-        bool pass = ck::profiler::profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank>(
-            verify_, init_method, log, bench_, in_length, strides, reduce_dims, alpha, beta);
+        bool pass = false;
+
+        if constexpr(Rank == 3)
+        {
+            if(reduce_dims.size() == 1)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 2)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 3)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+        }
+        else if constexpr(Rank == 4)
+        {
+            if(reduce_dims.size() == 1)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 2)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 3)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 4)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 4>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+        } // any other Rank or reduce_dims.size() leaves pass == false, failing the check below
+
         EXPECT_TRUE(pass);
     }
 

From de0bb3c26de393d56d17f64e2f064b2c6a7be84c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 30 May 2023 22:20:46 +0000
Subject: [PATCH 02/14] Move the generic kernel instance to be the first of the
 instance list for elementwise op of normalization

---
 .../gpu/elementwise/device_normalize_instance.cpp             | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
index 182037f15c6..ff846d400a2 100644
--- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
@@ -28,10 +28,10 @@ using Normalize = ck::tensor_operation::element_wise::Normalize;
 using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple<
     // clang-format off
     //###################|<in, mean, square_mean, gamma, beta>| <out>|  functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
+    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   1,       Sequence<1, 1, 1, 1, 1>,      Sequence<1>                >,
     DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   8,       Sequence<8, 1, 1, 8, 8>,      Sequence<8>                >,
     DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   4,       Sequence<4, 1, 1, 4, 4>,      Sequence<4>                >,
-    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   2,       Sequence<2, 1, 1, 2, 2>,      Sequence<2>                >,
-    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   1,       Sequence<1, 1, 1, 1, 1>,      Sequence<1>                >
+    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   2,       Sequence<2, 1, 1, 2, 2>,      Sequence<2>                >
     // clang-format on
     >;
 

From a9f0d000eb9fd240404112a526ef125429a351df Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 31 May 2023 20:03:24 +0000
Subject: [PATCH 03/14] Add GetGenericInstance() interface for
 DeviceOperationInstanceFactory class of DeviceSoftmax

---
 .../add_device_operation_instance.hpp         | 14 ++++
 .../tensor_operation_instance/gpu/softmax.hpp | 80 +++++++++++++++++++
 ...softmax_f16_f16_instance_rank3_reduce1.hpp |  3 +
 ...softmax_f16_f16_instance_rank3_reduce2.hpp |  3 +
 ...softmax_f16_f16_instance_rank3_reduce3.hpp |  3 +
 ...softmax_f16_f16_instance_rank4_reduce1.hpp |  3 +
 ...softmax_f16_f16_instance_rank4_reduce2.hpp |  3 +
 ...softmax_f16_f16_instance_rank4_reduce3.hpp |  3 +
 ...softmax_f16_f16_instance_rank4_reduce4.hpp |  3 +
 ...softmax_f32_f32_instance_rank3_reduce1.hpp |  3 +
 ...softmax_f32_f32_instance_rank3_reduce2.hpp |  3 +
 ...softmax_f32_f32_instance_rank3_reduce3.hpp |  3 +
 ...softmax_f32_f32_instance_rank4_reduce1.hpp |  3 +
 ...softmax_f32_f32_instance_rank4_reduce2.hpp |  3 +
 ...softmax_f32_f32_instance_rank4_reduce3.hpp |  3 +
 ...softmax_f32_f32_instance_rank4_reduce4.hpp |  3 +
 ...e_softmax_i8_i8_instance_rank3_reduce1.hpp |  3 +
 ...e_softmax_i8_i8_instance_rank3_reduce2.hpp |  3 +
 ...e_softmax_i8_i8_instance_rank3_reduce3.hpp |  3 +
 ...e_softmax_i8_i8_instance_rank4_reduce1.hpp |  3 +
 ...e_softmax_i8_i8_instance_rank4_reduce2.hpp |  3 +
 ...e_softmax_i8_i8_instance_rank4_reduce3.hpp |  3 +
 ...e_softmax_i8_i8_instance_rank4_reduce4.hpp |  3 +
 ...softmax_f16_f16_instance_rank3_reduce1.cpp |  6 ++
 ...softmax_f16_f16_instance_rank3_reduce2.cpp |  6 ++
 ...softmax_f16_f16_instance_rank3_reduce3.cpp |  6 ++
 ...softmax_f16_f16_instance_rank4_reduce1.cpp |  6 ++
 ...softmax_f16_f16_instance_rank4_reduce2.cpp |  6 ++
 ...softmax_f16_f16_instance_rank4_reduce3.cpp |  6 ++
 ...softmax_f16_f16_instance_rank4_reduce4.cpp |  6 ++
 ...softmax_f32_f32_instance_rank3_reduce1.cpp |  6 ++
 ...softmax_f32_f32_instance_rank3_reduce2.cpp |  6 ++
 ...softmax_f32_f32_instance_rank3_reduce3.cpp |  6 ++
 ...softmax_f32_f32_instance_rank4_reduce1.cpp |  6 ++
 ...softmax_f32_f32_instance_rank4_reduce2.cpp |  6 ++
 ...softmax_f32_f32_instance_rank4_reduce3.cpp |  6 ++
 ...softmax_f32_f32_instance_rank4_reduce4.cpp |  6 ++
 ...e_softmax_i8_i8_instance_rank3_reduce1.cpp |  6 ++
 ...e_softmax_i8_i8_instance_rank3_reduce2.cpp |  6 ++
 ...e_softmax_i8_i8_instance_rank3_reduce3.cpp |  6 ++
 ...e_softmax_i8_i8_instance_rank4_reduce1.cpp |  6 ++
 ...e_softmax_i8_i8_instance_rank4_reduce2.cpp |  6 ++
 ...e_softmax_i8_i8_instance_rank4_reduce3.cpp |  6 ++
 ...e_softmax_i8_i8_instance_rank4_reduce4.cpp |  6 ++
 44 files changed, 283 insertions(+)

diff --git a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
index 20df1b3616a..ab83fb373a8 100644
--- a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
@@ -29,6 +29,20 @@ void add_device_operation_instances(std::vector<std::unique_ptr<BaseOp>>& op_ins
     });
 }
 
+template <typename BaseOp, typename NewOpInstances>
+void get_first_device_operation_instance(std::unique_ptr<BaseOp>& op_instance,
+                                         const NewOpInstances& new_op_instances)
+{
+    // Store an owning copy of tuple element 0 of new_op_instances in op_instance.
+    // Bind by reference so the only copy made is the one make_unique performs below.
+    const auto& first_op_instance = std::get<0>(new_op_instances);
+    using FirstOpInstance = remove_cvref_t<decltype(first_op_instance)>;
+
+    static_assert(std::is_base_of_v<BaseOp, FirstOpInstance>,
+                  "wrong! FirstOpInstance should be derived from BaseOp");
+    op_instance = std::make_unique<FirstOpInstance>(first_op_instance);
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
index b4e816d822a..6e1f58cb92e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
@@ -37,6 +37,86 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceSoftma
                                    Rank,
                                    NumReduceDim>;
 
+    static auto GetGenericInstance()
+    {
+        std::unique_ptr<DeviceOp> op_ptr;
+        // Select the matching get_*_generic_instance() at compile time to fill op_ptr.
+        if constexpr(std::is_same_v<InDataType, F16> && std::is_same_v<AccDataType, F32> &&
+                     std::is_same_v<OutDataType, F16>)
+        {
+            if constexpr(Rank == 3)
+            {
+                if constexpr(NumReduceDim == 1)
+                    get_device_softmax_f16_f16_rank3_reduce1_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 2)
+                    get_device_softmax_f16_f16_rank3_reduce2_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 3)
+                    get_device_softmax_f16_f16_rank3_reduce3_generic_instance(op_ptr);
+            }
+            else if constexpr(Rank == 4)
+            {
+                if constexpr(NumReduceDim == 1)
+                    get_device_softmax_f16_f16_rank4_reduce1_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 2)
+                    get_device_softmax_f16_f16_rank4_reduce2_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 3)
+                    get_device_softmax_f16_f16_rank4_reduce3_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 4)
+                    get_device_softmax_f16_f16_rank4_reduce4_generic_instance(op_ptr);
+            }
+        }
+        else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
+                          std::is_same_v<OutDataType, F32>)
+        {
+            if constexpr(Rank == 3)
+            {
+                if constexpr(NumReduceDim == 1)
+                    get_device_softmax_f32_f32_rank3_reduce1_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 2)
+                    get_device_softmax_f32_f32_rank3_reduce2_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 3)
+                    get_device_softmax_f32_f32_rank3_reduce3_generic_instance(op_ptr);
+            }
+            else if constexpr(Rank == 4)
+            {
+                if constexpr(NumReduceDim == 1)
+                    get_device_softmax_f32_f32_rank4_reduce1_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 2)
+                    get_device_softmax_f32_f32_rank4_reduce2_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 3)
+                    get_device_softmax_f32_f32_rank4_reduce3_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 4)
+                    get_device_softmax_f32_f32_rank4_reduce4_generic_instance(op_ptr);
+            }
+        }
+        else if constexpr(std::is_same_v<InDataType, I8> && std::is_same_v<AccDataType, F32> &&
+                          std::is_same_v<OutDataType, I8>)
+        {
+            if constexpr(Rank == 3)
+            {
+                if constexpr(NumReduceDim == 1)
+                    get_device_softmax_i8_i8_rank3_reduce1_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 2)
+                    get_device_softmax_i8_i8_rank3_reduce2_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 3)
+                    get_device_softmax_i8_i8_rank3_reduce3_generic_instance(op_ptr);
+            }
+            else if constexpr(Rank == 4)
+            {
+                if constexpr(NumReduceDim == 1)
+                    get_device_softmax_i8_i8_rank4_reduce1_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 2)
+                    get_device_softmax_i8_i8_rank4_reduce2_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 3)
+                    get_device_softmax_i8_i8_rank4_reduce3_generic_instance(op_ptr);
+                else if constexpr(NumReduceDim == 4)
+                    get_device_softmax_i8_i8_rank4_reduce4_generic_instance(op_ptr);
+            }
+        }
+        // op_ptr is still null if no type/Rank/NumReduceDim combination above matched.
+        return op_ptr;
+    }
+
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
index 868d3b72122..0f75242f1ca 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f16_f16_rank3_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances);
 
+void get_device_softmax_f16_f16_rank3_reduce1_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
index b6d422e7b8e..80b7a83cdb6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f16_f16_rank3_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances);
 
+void get_device_softmax_f16_f16_rank3_reduce2_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
index 88dce37b62e..4721b56e6cc 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f16_f16_rank3_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances);
 
+void get_device_softmax_f16_f16_rank3_reduce3_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
index 9e2783a9bd7..fd9ca5e57c8 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances);
 
+void get_device_softmax_f16_f16_rank4_reduce1_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
index d7fc62ca9d4..6b03ab9595b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances);
 
+void get_device_softmax_f16_f16_rank4_reduce2_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
index f5f1143fb90..f62d3310f11 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances);
 
+void get_device_softmax_f16_f16_rank4_reduce3_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
index 85fbef53b76..17a60b32359 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce4_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances);
 
+void get_device_softmax_f16_f16_rank4_reduce4_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
index 4cd3db2ab97..3c8fae9a41d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f32_f32_rank3_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances);
 
+void get_device_softmax_f32_f32_rank3_reduce1_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
index 20cfbd43af2..0138c7312e2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f32_f32_rank3_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances);
 
+void get_device_softmax_f32_f32_rank3_reduce2_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
index e3ad524762e..063341cd69d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f32_f32_rank3_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances);
 
+void get_device_softmax_f32_f32_rank3_reduce3_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
index 8a6f8b4206a..c740619986e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances);
 
+void get_device_softmax_f32_f32_rank4_reduce1_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
index f3a4e6b9887..89bb27b1974 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances);
 
+void get_device_softmax_f32_f32_rank4_reduce2_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
index 0721357f588..e1aada99ebf 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances);
 
+void get_device_softmax_f32_f32_rank4_reduce3_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
index a479be7c6ea..253dde1404e 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce4_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances);
 
+void get_device_softmax_f32_f32_rank4_reduce4_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
index 5d9cbcee2be..0634d808ce5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_i8_i8_rank3_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances);
 
+void get_device_softmax_i8_i8_rank3_reduce1_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
index e2fb5190704..7fc05eafb27 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_i8_i8_rank3_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances);
 
+void get_device_softmax_i8_i8_rank3_reduce2_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
index 10f5fb8918c..1591a4740dd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_i8_i8_rank3_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances);
 
+void get_device_softmax_i8_i8_rank3_reduce3_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
index 82127e99279..a0b94f84e29 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances);
 
+void get_device_softmax_i8_i8_rank4_reduce1_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
index b1a5caf404a..6cda277fb09 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances);
 
+void get_device_softmax_i8_i8_rank4_reduce2_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
index 654d33ca772..fefecf1f757 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances);
 
+void get_device_softmax_i8_i8_rank4_reduce3_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
index 4db2c687a6c..f9dc9046205 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
@@ -16,6 +16,9 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce4_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances);
 
+void get_device_softmax_i8_i8_rank4_reduce4_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>& instance);
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
index 3c7c5cb1291..407d791ad13 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank3_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
 }
 
+void get_device_softmax_f16_f16_rank3_reduce1_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 1>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
index 2ce22a97730..9582ec3decc 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank3_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
 }
 
+void get_device_softmax_f16_f16_rank3_reduce2_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 2>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
index 5ce03f02e2c..eada467f4f9 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank3_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{});
 }
 
+void get_device_softmax_f16_f16_rank3_reduce3_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 3>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
index c020aa341d1..b1eae23539e 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank4_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
 }
 
+void get_device_softmax_f16_f16_rank4_reduce1_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 1>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
index 0a3b0978a18..d35b97d04cb 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank4_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
 }
 
+void get_device_softmax_f16_f16_rank4_reduce2_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 2>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
index cfa0375c09d..24d9ebae0b5 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank4_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
 }
 
+void get_device_softmax_f16_f16_rank4_reduce3_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 3>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
index 679d6a6364d..373a96f6a8c 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f16_f16_rank4_reduce4_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{});
 }
 
+void get_device_softmax_f16_f16_rank4_reduce4_generic_instance(
+    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 4>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
index 17dfbb54698..cfe838c547c 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank3_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
 }
 
+void get_device_softmax_f32_f32_rank3_reduce1_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 1>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
index 03127397044..0d2d6cb3ccd 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank3_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
 }
 
+void get_device_softmax_f32_f32_rank3_reduce2_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 2>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
index cc9efe1c858..315389966e8 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank3_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{});
 }
 
+void get_device_softmax_f32_f32_rank3_reduce3_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 3>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
index a352082990d..337b6b3aa28 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank4_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
 }
 
+void get_device_softmax_f32_f32_rank4_reduce1_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 1>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
index ec1619a71d2..8ccb0b739a5 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank4_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
 }
 
+void get_device_softmax_f32_f32_rank4_reduce2_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 2>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
index a0cf3d08587..f8b12112f61 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank4_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
 }
 
+void get_device_softmax_f32_f32_rank4_reduce3_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 3>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
index e8b5bc87a5b..96dbd1cc3a2 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_f32_f32_rank4_reduce4_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{});
 }
 
+void get_device_softmax_f32_f32_rank4_reduce4_generic_instance(
+    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 4>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
index 944e0c3bacd..4f2799c1c11 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank3_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{});
 }
 
+void get_device_softmax_i8_i8_rank3_reduce1_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 1>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
index 24da3e58d23..b2c592aab82 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank3_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{});
 }
 
+void get_device_softmax_i8_i8_rank3_reduce2_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 2>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
index 7febbf23534..3603fca7220 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank3_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{});
 }
 
+void get_device_softmax_i8_i8_rank3_reduce3_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 3>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
index 08b56f01667..a0f2ca2f73a 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank4_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{});
 }
 
+void get_device_softmax_i8_i8_rank4_reduce1_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 1>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
index 7f0ebceb82b..4788d6d12bf 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank4_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 2>{});
 }
 
+void get_device_softmax_i8_i8_rank4_reduce2_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 2>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
index 7145a8d91b0..ba1f06629be 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank4_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{});
 }
 
+void get_device_softmax_i8_i8_rank4_reduce3_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 3>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
index 6118a84ab6d..efaf9c02acc 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
@@ -19,6 +19,12 @@ void add_device_softmax_i8_i8_rank4_reduce4_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{});
 }
 
+void get_device_softmax_i8_i8_rank4_reduce4_generic_instance(
+    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>& instance)
+{
+    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 4>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation

From f629cd9a93ce38dfed4886d849f3c38d2e5379c8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 31 May 2023 20:05:54 +0000
Subject: [PATCH 04/14] Add testing of GetGenericInstance() in client_example
 of Softmax

---
 client_example/06_softmax/softmax4d.cpp | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp
index aef5624cadc..401b161d116 100644
--- a/client_example/06_softmax/softmax4d.cpp
+++ b/client_example/06_softmax/softmax4d.cpp
@@ -6,6 +6,7 @@
 #include <iomanip>
 #include <iostream>
 #include <vector>
+#include <stdexcept>
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -60,6 +61,24 @@ int main(int argc, char* argv[])
                                                                  PassThrough,
                                                                  Rank,
                                                                  NumReduceDim>;
+
+    const auto g_op_ptr = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetGenericInstance();
+
+    auto g_op_argument_ptr = g_op_ptr->MakeArgumentPointer(in_lengths,
+                                                           in_strides,
+                                                           reduce_dims,
+                                                           alpha,
+                                                           beta,
+                                                           in.GetDeviceBuffer(),
+                                                           out.GetDeviceBuffer(),
+                                                           PassThrough{},
+                                                           PassThrough{});
+
+    if(!g_op_ptr->IsSupportedArgument(g_op_argument_ptr.get()))
+        throw std::runtime_error(
+            "Generic instance should be suitable for various input lengths/strides");
+
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp>::GetInstances();
@@ -122,6 +141,7 @@ int main(int argc, char* argv[])
               << best_op_name << std::endl;
 
     // run the best intance
+    if(found)
     {
         auto& op_ptr = op_ptrs[best_op_id];
         std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()

From 49a1c097c6d24472e694dcb02390459d349b20e0 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 5 Jun 2023 18:43:51 +0000
Subject: [PATCH 05/14] Revert "Add testing of GetGenericInstance() in
 client_example of Softmax"

This reverts commit f629cd9a93ce38dfed4886d849f3c38d2e5379c8.
---
 client_example/06_softmax/softmax4d.cpp | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp
index 4f399b0e6a0..21c226c1ab7 100644
--- a/client_example/06_softmax/softmax4d.cpp
+++ b/client_example/06_softmax/softmax4d.cpp
@@ -6,7 +6,6 @@
 #include <iomanip>
 #include <iostream>
 #include <vector>
-#include <stdexcept>
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -61,24 +60,6 @@ int main(int argc, char* argv[])
                                                                  PassThrough,
                                                                  Rank,
                                                                  NumReduceDim>;
-
-    const auto g_op_ptr = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp>::GetGenericInstance();
-
-    auto g_op_argument_ptr = g_op_ptr->MakeArgumentPointer(in_lengths,
-                                                           in_strides,
-                                                           reduce_dims,
-                                                           alpha,
-                                                           beta,
-                                                           in.GetDeviceBuffer(),
-                                                           out.GetDeviceBuffer(),
-                                                           PassThrough{},
-                                                           PassThrough{});
-
-    if(!g_op_ptr->IsSupportedArgument(g_op_argument_ptr.get()))
-        throw std::runtime_error(
-            "Generic instance should be suitable for various input lengths/strides");
-
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp>::GetInstances();
@@ -141,7 +122,6 @@ int main(int argc, char* argv[])
               << best_op_name << std::endl;
 
     // run the best intance
-    if(found)
     {
         auto& op_ptr = op_ptrs[best_op_id];
         std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()

From 3c0e60bcd1c8d4de3083e6bc7b0d93602c051d16 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 5 Jun 2023 18:44:18 +0000
Subject: [PATCH 06/14] Revert "Add GetGenericInstance() interface for
 DeviceOperationInstanceFactory class of DeviceSoftmax"

This reverts commit a9f0d000eb9fd240404112a526ef125429a351df.
---
 .../add_device_operation_instance.hpp         | 14 ----
 .../tensor_operation_instance/gpu/softmax.hpp | 80 -------------------
 ...softmax_f16_f16_instance_rank3_reduce1.hpp |  3 -
 ...softmax_f16_f16_instance_rank3_reduce2.hpp |  3 -
 ...softmax_f16_f16_instance_rank3_reduce3.hpp |  3 -
 ...softmax_f16_f16_instance_rank4_reduce1.hpp |  3 -
 ...softmax_f16_f16_instance_rank4_reduce2.hpp |  3 -
 ...softmax_f16_f16_instance_rank4_reduce3.hpp |  3 -
 ...softmax_f16_f16_instance_rank4_reduce4.hpp |  3 -
 ...softmax_f32_f32_instance_rank3_reduce1.hpp |  3 -
 ...softmax_f32_f32_instance_rank3_reduce2.hpp |  3 -
 ...softmax_f32_f32_instance_rank3_reduce3.hpp |  3 -
 ...softmax_f32_f32_instance_rank4_reduce1.hpp |  3 -
 ...softmax_f32_f32_instance_rank4_reduce2.hpp |  3 -
 ...softmax_f32_f32_instance_rank4_reduce3.hpp |  3 -
 ...softmax_f32_f32_instance_rank4_reduce4.hpp |  3 -
 ...e_softmax_i8_i8_instance_rank3_reduce1.hpp |  3 -
 ...e_softmax_i8_i8_instance_rank3_reduce2.hpp |  3 -
 ...e_softmax_i8_i8_instance_rank3_reduce3.hpp |  3 -
 ...e_softmax_i8_i8_instance_rank4_reduce1.hpp |  3 -
 ...e_softmax_i8_i8_instance_rank4_reduce2.hpp |  3 -
 ...e_softmax_i8_i8_instance_rank4_reduce3.hpp |  3 -
 ...e_softmax_i8_i8_instance_rank4_reduce4.hpp |  3 -
 ...softmax_f16_f16_instance_rank3_reduce1.cpp |  6 --
 ...softmax_f16_f16_instance_rank3_reduce2.cpp |  6 --
 ...softmax_f16_f16_instance_rank3_reduce3.cpp |  6 --
 ...softmax_f16_f16_instance_rank4_reduce1.cpp |  6 --
 ...softmax_f16_f16_instance_rank4_reduce2.cpp |  6 --
 ...softmax_f16_f16_instance_rank4_reduce3.cpp |  6 --
 ...softmax_f16_f16_instance_rank4_reduce4.cpp |  6 --
 ...softmax_f32_f32_instance_rank3_reduce1.cpp |  6 --
 ...softmax_f32_f32_instance_rank3_reduce2.cpp |  6 --
 ...softmax_f32_f32_instance_rank3_reduce3.cpp |  6 --
 ...softmax_f32_f32_instance_rank4_reduce1.cpp |  6 --
 ...softmax_f32_f32_instance_rank4_reduce2.cpp |  6 --
 ...softmax_f32_f32_instance_rank4_reduce3.cpp |  6 --
 ...softmax_f32_f32_instance_rank4_reduce4.cpp |  6 --
 ...e_softmax_i8_i8_instance_rank3_reduce1.cpp |  6 --
 ...e_softmax_i8_i8_instance_rank3_reduce2.cpp |  6 --
 ...e_softmax_i8_i8_instance_rank3_reduce3.cpp |  6 --
 ...e_softmax_i8_i8_instance_rank4_reduce1.cpp |  6 --
 ...e_softmax_i8_i8_instance_rank4_reduce2.cpp |  6 --
 ...e_softmax_i8_i8_instance_rank4_reduce3.cpp |  6 --
 ...e_softmax_i8_i8_instance_rank4_reduce4.cpp |  6 --
 44 files changed, 283 deletions(-)

diff --git a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
index 4a9132bc026..f57fed9c07c 100644
--- a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
@@ -29,20 +29,6 @@ void add_device_operation_instances(std::vector<std::unique_ptr<BaseOp>>& op_ins
     });
 }
 
-template <typename BaseOp, typename NewOpInstances>
-void get_first_device_operation_instance(std::unique_ptr<BaseOp>& op_instance,
-                                         const NewOpInstances& new_op_instances)
-{
-    const auto first_op_instance = std::get<0>(new_op_instances);
-
-    using FirstOpInstance = remove_cvref_t<decltype(first_op_instance)>;
-
-    static_assert(std::is_base_of_v<BaseOp, FirstOpInstance>,
-                  "wrong! FirstOpInstance should be derived from BaseOp");
-
-    op_instance = std::make_unique<FirstOpInstance>(first_op_instance);
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
index 5a454cfbe24..3f82b5bfd86 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
@@ -37,86 +37,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceSoftma
                                    Rank,
                                    NumReduceDim>;
 
-    static auto GetGenericInstance()
-    {
-        std::unique_ptr<DeviceOp> op_ptr;
-
-        if constexpr(std::is_same_v<InDataType, F16> && std::is_same_v<AccDataType, F32> &&
-                     std::is_same_v<OutDataType, F16>)
-        {
-            if constexpr(Rank == 3)
-            {
-                if constexpr(NumReduceDim == 1)
-                    get_device_softmax_f16_f16_rank3_reduce1_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 2)
-                    get_device_softmax_f16_f16_rank3_reduce2_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 3)
-                    get_device_softmax_f16_f16_rank3_reduce3_generic_instance(op_ptr);
-            }
-            else if constexpr(Rank == 4)
-            {
-                if constexpr(NumReduceDim == 1)
-                    get_device_softmax_f16_f16_rank4_reduce1_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 2)
-                    get_device_softmax_f16_f16_rank4_reduce2_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 3)
-                    get_device_softmax_f16_f16_rank4_reduce3_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 4)
-                    get_device_softmax_f16_f16_rank4_reduce4_generic_instance(op_ptr);
-            }
-        }
-        else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
-                          std::is_same_v<OutDataType, F32>)
-        {
-            if constexpr(Rank == 3)
-            {
-                if constexpr(NumReduceDim == 1)
-                    get_device_softmax_f32_f32_rank3_reduce1_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 2)
-                    get_device_softmax_f32_f32_rank3_reduce2_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 3)
-                    get_device_softmax_f32_f32_rank3_reduce3_generic_instance(op_ptr);
-            }
-            else if constexpr(Rank == 4)
-            {
-                if constexpr(NumReduceDim == 1)
-                    get_device_softmax_f32_f32_rank4_reduce1_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 2)
-                    get_device_softmax_f32_f32_rank4_reduce2_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 3)
-                    get_device_softmax_f32_f32_rank4_reduce3_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 4)
-                    get_device_softmax_f32_f32_rank4_reduce4_generic_instance(op_ptr);
-            }
-        }
-        else if constexpr(std::is_same_v<InDataType, I8> && std::is_same_v<AccDataType, F32> &&
-                          std::is_same_v<OutDataType, I8>)
-        {
-            if constexpr(Rank == 3)
-            {
-                if constexpr(NumReduceDim == 1)
-                    get_device_softmax_i8_i8_rank3_reduce1_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 2)
-                    get_device_softmax_i8_i8_rank3_reduce2_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 3)
-                    get_device_softmax_i8_i8_rank3_reduce3_generic_instance(op_ptr);
-            }
-            else if constexpr(Rank == 4)
-            {
-                if constexpr(NumReduceDim == 1)
-                    get_device_softmax_i8_i8_rank4_reduce1_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 2)
-                    get_device_softmax_i8_i8_rank4_reduce2_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 3)
-                    get_device_softmax_i8_i8_rank4_reduce3_generic_instance(op_ptr);
-                else if constexpr(NumReduceDim == 4)
-                    get_device_softmax_i8_i8_rank4_reduce4_generic_instance(op_ptr);
-            }
-        }
-
-        return op_ptr;
-    };
-
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
index a5bedf47d9f..3fd2bd089ed 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f16_f16_rank3_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances);
 
-void get_device_softmax_f16_f16_rank3_reduce1_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
index 5d105706c58..210fdc0a585 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f16_f16_rank3_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances);
 
-void get_device_softmax_f16_f16_rank3_reduce2_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
index 20b6281b40d..894fb034d0d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f16_f16_rank3_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances);
 
-void get_device_softmax_f16_f16_rank3_reduce3_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
index ffc40eefd20..708ef0ce130 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances);
 
-void get_device_softmax_f16_f16_rank4_reduce1_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
index e435c5ffb80..6754e5ceffa 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances);
 
-void get_device_softmax_f16_f16_rank4_reduce2_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
index 89ace5fa165..5e111176e19 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances);
 
-void get_device_softmax_f16_f16_rank4_reduce3_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
index 5007b76c074..a3cecb32f83 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce4_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances);
 
-void get_device_softmax_f16_f16_rank4_reduce4_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
index 8f1e7491e02..4cc46902533 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f32_f32_rank3_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances);
 
-void get_device_softmax_f32_f32_rank3_reduce1_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
index a51593a6b9f..65724d7888a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f32_f32_rank3_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances);
 
-void get_device_softmax_f32_f32_rank3_reduce2_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
index 7f68f9c7d12..13bd45598ec 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f32_f32_rank3_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances);
 
-void get_device_softmax_f32_f32_rank3_reduce3_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
index a63608f2c49..d58b424ee94 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances);
 
-void get_device_softmax_f32_f32_rank4_reduce1_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
index a93993c66a4..378e45eeb78 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances);
 
-void get_device_softmax_f32_f32_rank4_reduce2_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
index 0a2cb69485f..293df08c7e9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances);
 
-void get_device_softmax_f32_f32_rank4_reduce3_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
index 58d0fe112f1..e503a9fec1f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce4_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances);
 
-void get_device_softmax_f32_f32_rank4_reduce4_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
index e34db34b470..e047bf606ab 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_i8_i8_rank3_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances);
 
-void get_device_softmax_i8_i8_rank3_reduce1_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
index f6c666b9c7b..6945a535ee2 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_i8_i8_rank3_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances);
 
-void get_device_softmax_i8_i8_rank3_reduce2_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
index 5de51548dcb..54ef4932e4f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_i8_i8_rank3_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances);
 
-void get_device_softmax_i8_i8_rank3_reduce3_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
index 767991073c1..577485f21da 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances);
 
-void get_device_softmax_i8_i8_rank4_reduce1_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
index 4192fe6cd80..3db80207e38 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances);
 
-void get_device_softmax_i8_i8_rank4_reduce2_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
index 555945856aa..d076beda3d9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances);
 
-void get_device_softmax_i8_i8_rank4_reduce3_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
index e5f87c73128..19b913d859a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
@@ -16,9 +16,6 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce4_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances);
 
-void get_device_softmax_i8_i8_rank4_reduce4_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>& instance);
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
index 69d965fed53..2f77da9efe0 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank3_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
 }
 
-void get_device_softmax_f16_f16_rank3_reduce1_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 1>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
index e09ec97a28f..b1c8c126b55 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank3_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
 }
 
-void get_device_softmax_f16_f16_rank3_reduce2_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 2>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
index 0b13d934af7..898375567e2 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank3_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{});
 }
 
-void get_device_softmax_f16_f16_rank3_reduce3_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<3, 3>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
index c50c11d6574..2ea196577cf 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank4_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
 }
 
-void get_device_softmax_f16_f16_rank4_reduce1_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 1>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
index b5ef88ea5b3..d373f918b81 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank4_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
 }
 
-void get_device_softmax_f16_f16_rank4_reduce2_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 2>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
index 2f386e87268..07fabead64d 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank4_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
 }
 
-void get_device_softmax_f16_f16_rank4_reduce3_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 3>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
index 5c7522505b5..36de53bd7c6 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f16_f16_rank4_reduce4_instances(
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{});
 }
 
-void get_device_softmax_f16_f16_rank4_reduce4_generic_instance(
-    DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f16_f16_instances<4, 4>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
index 21b886f075e..4ebcfb7075d 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank3_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
 }
 
-void get_device_softmax_f32_f32_rank3_reduce1_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 1>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
index bc5f44a6e0e..0c61d81c5b0 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank3_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
 }
 
-void get_device_softmax_f32_f32_rank3_reduce2_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 2>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
index d785c9f88c5..7670ecf80db 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank3_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{});
 }
 
-void get_device_softmax_f32_f32_rank3_reduce3_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<3, 3>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
index 83b2e1437ea..fcdc9627194 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank4_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
 }
 
-void get_device_softmax_f32_f32_rank4_reduce1_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 1>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
index 9c3a2401feb..c2faac889e9 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank4_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
 }
 
-void get_device_softmax_f32_f32_rank4_reduce2_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 2>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
index 8067d0d9ef1..ba11fb09084 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank4_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
 }
 
-void get_device_softmax_f32_f32_rank4_reduce3_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 3>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
index a4b9f0b2e97..b608ac449ae 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_f32_f32_rank4_reduce4_instances(
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{});
 }
 
-void get_device_softmax_f32_f32_rank4_reduce4_generic_instance(
-    DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_f32_f32_instances<4, 4>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
index 6118a252361..fe578366101 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank3_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{});
 }
 
-void get_device_softmax_i8_i8_rank3_reduce1_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 1>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
index 3d630984724..c3f6b2f823c 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank3_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{});
 }
 
-void get_device_softmax_i8_i8_rank3_reduce2_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 2>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
index 1e92994dffd..4b372626e5d 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank3_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{});
 }
 
-void get_device_softmax_i8_i8_rank3_reduce3_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<3, 3>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
index a0f5c1549e0..876bb5af874 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank4_reduce1_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{});
 }
 
-void get_device_softmax_i8_i8_rank4_reduce1_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 1>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
index c8e4c101197..1539d8a55e7 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank4_reduce2_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 2>{});
 }
 
-void get_device_softmax_i8_i8_rank4_reduce2_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 2>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
index 32e18c30644..1d59752b59e 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank4_reduce3_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{});
 }
 
-void get_device_softmax_i8_i8_rank4_reduce3_generic_instance(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 3>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
index 654878abd39..aecdfe542e4 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
@@ -19,12 +19,6 @@ void add_device_softmax_i8_i8_rank4_reduce4_instances(
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{});
 }
 
-void add_device_softmax_i8_i8_rank4_reduce4_instances(
-    DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>& instance)
-{
-    get_first_device_operation_instance(instance, device_softmax_i8_i8_instances<4, 4>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation

From 43eb43b05e62f2a5e606f5bbe5ae3571d9787dbb Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Tue, 6 Jun 2023 20:18:10 +0000
Subject: [PATCH 07/14] Support generic kernel instance to be the first
 instance returned by GetInstances() for GroupNorm

---
 .../18_groupnorm/groupnorm_swish.cpp          | 24 +++++++++++++++++++
 .../device_groupnorm_f16_instance.cpp         |  1 +
 .../device_groupnorm_f32_instance.cpp         |  1 +
 ...oupnorm_swish_f16_f32_f32_f16_instance.cpp |  2 ++
 .../device_groupnorm_swish_f16_instance.cpp   |  1 +
 .../device_groupnorm_swish_f32_instance.cpp   |  1 +
 .../device_layernorm2d_f16_instance.cpp       |  1 +
 .../device_layernorm2d_f32_instance.cpp       |  1 +
 .../device_layernorm4d_f16_instance.cpp       |  1 +
 .../device_layernorm4d_f32_instance.cpp       |  1 +
 .../normalization_instance_common.hpp         | 21 ++++++++++++++++
 11 files changed, 55 insertions(+)

diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp
index 308061a3249..e1d198d2282 100644
--- a/client_example/18_groupnorm/groupnorm_swish.cpp
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
@@ -72,6 +72,30 @@ int main(int argc, char* argv[])
 
     std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
 
+    // The generic kernel instance is returned first; at() throws if no instance was registered.
+    const auto& generic_op_ptr = op_ptrs.at(0);
+    auto generic_argument_ptr =
+        generic_op_ptr->MakeArgumentPointer({N, H, W, G, C},    // lengths
+                                            xy_strides,         // xStrides
+                                            gamma_beta_strides, // gammaStrides
+                                            gamma_beta_strides, // betaStrides
+                                            xy_strides,         // yStrides
+                                            {1, 2, 4},          // reduceDims
+                                            1e-6,
+                                            x_device_buf.GetDeviceBuffer(),
+                                            gamma_device_buf.GetDeviceBuffer(),
+                                            beta_device_buf.GetDeviceBuffer(),
+                                            y_device_buf.GetDeviceBuffer(),
+                                            nullptr,
+                                            nullptr,
+                                            Swish{});
+
+    if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
+    {
+        throw std::runtime_error(
+            "The generic kernel instance should be able to support any input shapes");
+    }
+
     std::string best_op_name;
     bool found            = false;
     int best_op_id        = -1;
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
index be860f58e06..775fabaf081 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
@@ -14,6 +14,7 @@ void add_device_normalization_rank_5_3_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances, device_normalization_f16_generic_instance<Pass, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
index 9a64e555d65..a76f085ae4b 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
@@ -14,6 +14,7 @@ void add_device_normalization_rank_5_3_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances, device_normalization_f32_generic_instance<Pass, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
index fe72a27331d..00b5101fdf3 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F32, F16, Swish, 5, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances,
+                                   device_normalization_f16_f32_f32_f16_generic_instance<Swish, 5, 3>{});
     add_device_operation_instances(instances,
                                    device_normalization_f16_f32_f32_f16_instances<Swish, 5, 3>{});
 }
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
index cac8641e135..736f8b304ce 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
@@ -14,6 +14,7 @@ void add_device_normalization_rank_5_3_swish_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Swish, 5, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances, device_normalization_f16_generic_instance<Swish, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Swish, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
index 0a9ac846235..567305598e2 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
@@ -14,6 +14,7 @@ void add_device_normalization_rank_5_3_swish_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances, device_normalization_f32_generic_instance<Swish, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Swish, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
index ad92818ec2f..ff2a13695fd 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
@@ -14,6 +14,7 @@ void add_device_normalization_rank_2_1_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>&
         instances)
 {
+    add_device_operation_instances(instances, device_normalization_f16_generic_instance<Pass, 2, 1>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
index 70e3bbc1c1d..62a8fa87da7 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
@@ -14,6 +14,7 @@ void add_device_normalization_rank_2_1_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>&
         instances)
 {
+    add_device_operation_instances(instances, device_normalization_f32_generic_instance<Pass, 2, 1>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 2, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
index 7c5d2c4a9c1..16d0f1f098e 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
@@ -14,6 +14,7 @@ void add_device_normalization_rank_4_3_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances, device_normalization_f16_generic_instance<Pass, 4, 3>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
index f5626d4a9a3..049fcd7769c 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
@@ -14,6 +14,7 @@ void add_device_normalization_rank_4_3_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>&
         instances)
 {
+    add_device_operation_instances(instances, device_normalization_f32_generic_instance<Pass, 4, 3>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 4, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
index d9029ac25e8..b0684962f9e 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp
@@ -43,6 +43,13 @@ using device_normalization_f16_instances =
         // clang-format on
         >;
 
+template <typename OutElementwise, index_t Rank, index_t Reduce>
+using device_normalization_f16_generic_instance = std::tuple<
+    // clang-format off
+        DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
+    // clang-format on
+    >;
+
 template <typename OutElementwise, index_t Rank, index_t Reduce>
 using device_normalization_f32_instances = std::tuple<
     // clang-format off
@@ -69,6 +76,13 @@ using device_normalization_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <typename OutElementwise, index_t Rank, index_t Reduce>
+using device_normalization_f32_generic_instance = std::tuple<
+    // clang-format off
+        DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
+    // clang-format on
+    >;
+
 template <typename OutElementwise, index_t Rank, index_t Reduce>
 using device_normalization_f16_f32_f32_f16_instances = std::tuple<
     // clang-format off
@@ -95,6 +109,13 @@ using device_normalization_f16_f32_f32_f16_instances = std::tuple<
     // clang-format on
     >;
 
+template <typename OutElementwise, index_t Rank, index_t Reduce>
+using device_normalization_f16_f32_f32_f16_generic_instance = std::tuple<
+    // clang-format off
+        DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation

From ab11f5490a4255d185627c4d00b4d92d2d41e0c5 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Jun 2023 14:36:19 +0000
Subject: [PATCH 08/14] Move generic kernel instance to separate tuple for
 elementwise op of normalization

---
 .../gpu/elementwise/device_normalize_instance.cpp      | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
index 706d4bac317..f2a5f0728ac 100644
--- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp
@@ -28,17 +28,25 @@ using Normalize = ck::tensor_operation::element_wise::Normalize;
 using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple<
     // clang-format off
     //###################|<in, mean, square_mean, gamma, beta>| <out>|  functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
-    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   1,       Sequence<1, 1, 1, 1, 1>,      Sequence<1>                >,
     DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   8,       Sequence<8, 1, 1, 8, 8>,      Sequence<8>                >,
     DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   4,       Sequence<4, 1, 1, 4, 4>,      Sequence<4>                >,
     DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   2,       Sequence<2, 1, 1, 2, 2>,      Sequence<2>                >
     // clang-format on
     >;
 
+using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance = std::tuple<
+    // clang-format off
+    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>,  Tuple<F16>,  Normalize,  2,   1,       Sequence<1, 1, 1, 1, 1>,      Sequence<1>                >
+    // clang-format on
+    >;
+
 void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
     std::vector<DeviceElementwisePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
         instances)
 {
+    add_device_operation_instances(
+        instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance{});
+
     add_device_operation_instances(
         instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{});
 }

From 2b721d83de32fd10b42fa9ad366ef046e0c445ad Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Jun 2023 14:39:31 +0000
Subject: [PATCH 09/14] Remove unused files for softmax instance

---
 .../device_softmax_f16_f16_instance.hpp       | 22 ----------
 .../device_softmax_f32_f32_instance.hpp       | 22 ----------
 .../softmax/device_softmax_i8_i8_instance.hpp | 22 ----------
 .../device_softmax_f16_f16_instance.cpp       | 40 -------------------
 .../device_softmax_f32_f32_instance.cpp       | 40 -------------------
 .../softmax/device_softmax_i8_i8_instance.cpp | 40 -------------------
 6 files changed, 186 deletions(-)
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
deleted file mode 100644
index 7c6f189cb99..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
deleted file mode 100644
index 41c67af7ade..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
deleted file mode 100644
index 3cd3742093f..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
deleted file mode 100644
index a86da7cc795..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
-    add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
-    add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_f16_f16_rank4_reduce1_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce2_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce3_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
deleted file mode 100644
index ab8a69eec21..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_f32_f32_rank3_reduce1_instances(instances);
-    add_device_softmax_f32_f32_rank3_reduce2_instances(instances);
-    add_device_softmax_f32_f32_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_f32_f32_rank4_reduce1_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce2_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce3_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
deleted file mode 100644
index 81a2ff80ca6..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_i8_i8_rank3_reduce1_instances(instances);
-    add_device_softmax_i8_i8_rank3_reduce2_instances(instances);
-    add_device_softmax_i8_i8_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_i8_i8_rank4_reduce1_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce2_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce3_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck

From 53a032061c634ab3226871074f7dbaef4b114425 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Jun 2023 15:24:06 +0000
Subject: [PATCH 10/14] Store the generic kernel instance in a separate tuple
 for softmax

---
 .../gpu/softmax/device_softmax_f16_f16_instance_type.hpp | 8 +++++++-
 .../gpu/softmax/device_softmax_f32_f32_instance_type.hpp | 9 ++++++++-
 .../gpu/softmax/device_softmax_i8_i8_instance_type.hpp   | 8 +++++++-
 .../device_softmax_f16_f16_instance_rank3_reduce1.cpp    | 1 +
 .../device_softmax_f16_f16_instance_rank3_reduce2.cpp    | 1 +
 .../device_softmax_f16_f16_instance_rank3_reduce3.cpp    | 1 +
 .../device_softmax_f16_f16_instance_rank4_reduce1.cpp    | 1 +
 .../device_softmax_f16_f16_instance_rank4_reduce2.cpp    | 1 +
 .../device_softmax_f16_f16_instance_rank4_reduce3.cpp    | 1 +
 .../device_softmax_f16_f16_instance_rank4_reduce4.cpp    | 1 +
 .../device_softmax_f32_f32_instance_rank3_reduce1.cpp    | 1 +
 .../device_softmax_f32_f32_instance_rank3_reduce2.cpp    | 1 +
 .../device_softmax_f32_f32_instance_rank3_reduce3.cpp    | 1 +
 .../device_softmax_f32_f32_instance_rank4_reduce1.cpp    | 1 +
 .../device_softmax_f32_f32_instance_rank4_reduce2.cpp    | 1 +
 .../device_softmax_f32_f32_instance_rank4_reduce3.cpp    | 1 +
 .../device_softmax_f32_f32_instance_rank4_reduce4.cpp    | 1 +
 .../device_softmax_i8_i8_instance_rank3_reduce1.cpp      | 1 +
 .../device_softmax_i8_i8_instance_rank3_reduce2.cpp      | 1 +
 .../device_softmax_i8_i8_instance_rank3_reduce3.cpp      | 1 +
 .../device_softmax_i8_i8_instance_rank4_reduce1.cpp      | 1 +
 .../device_softmax_i8_i8_instance_rank4_reduce2.cpp      | 1 +
 .../device_softmax_i8_i8_instance_rank4_reduce3.cpp      | 1 +
 .../device_softmax_i8_i8_instance_rank4_reduce4.cpp      | 1 +
 24 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
index 53c142f6120..8c0782daa55 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp
@@ -16,7 +16,6 @@ template <index_t Rank, index_t Reduce>
 using device_softmax_f16_f16_instances = std::tuple<
     // clang-format off
     //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    // fallback kernel
     DeviceSoftmaxImpl<       F16,         F32,         F16,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               1,              1>,
     DeviceSoftmaxImpl<       F16,         F32,         F16,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               8,              8>,
     DeviceSoftmaxImpl<       F16,         F32,         F16,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  4,                 64,                1,                8,              1,               8,              8>,
@@ -33,6 +32,13 @@ using device_softmax_f16_f16_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t Rank, index_t Reduce>
+using device_softmax_f16_f16_generic_instance = std::tuple< // In/Out vector sizes are 1
+    // clang-format off
+    DeviceSoftmaxImpl<       F16,         F32,         F16,     PassThrough,      PassThrough, Rank,       Reduce,        64,                  8,                  8,                1,                1,              1,               1,              1>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
index a034e41a072..90c5ddc8a01 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp
@@ -16,7 +16,7 @@ template <index_t Rank, index_t Reduce>
 using device_softmax_f32_f32_instances = std::tuple<
     // clang-format off
     //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               1,               1>, // fallback kernel
+    DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               1,               1>,
     DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               4,               4>,
     DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  4,                 64,                1,                8,              1,               4,               4>,
     DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,                8,              1,               4,               4>,
@@ -32,6 +32,13 @@ using device_softmax_f32_f32_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t Rank, index_t Reduce>
+using device_softmax_f32_f32_generic_instance = std::tuple< // In/Out vector sizes are 1
+    // clang-format off
+    DeviceSoftmaxImpl<       F32,         F32,         F32,     PassThrough,      PassThrough, Rank,       Reduce,        64,                  8,                  8,                1,                1,              1,               1,               1>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
index 6ff07de2360..aa4bf6be3b1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
@@ -17,7 +17,6 @@ template <index_t Rank, index_t Reduce>
 using device_softmax_i8_i8_instances = std::tuple<
     // clang-format off
     //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    // fallback kernel
     DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,               16,              1,               1,              1>,
     DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,               16,              1,              16,             16>,
     DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  4,                 64,                1,               16,              1,              16,             16>,
@@ -34,6 +33,13 @@ using device_softmax_i8_i8_instances = std::tuple<
     // clang-format on
     >;
 
+template <index_t Rank, index_t Reduce>
+using device_softmax_i8_i8_generic_instance = std::tuple< // In/Out vector sizes are 1
+    // clang-format off
+    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,        64,                  8,                  8,                1,                1,              1,               1,              1>
+    // clang-format on
+    >;
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
index 2f77da9efe0..36867d993f9 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f16_f16_rank3_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 1>{});
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
index b1c8c126b55..373f33ad597 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f16_f16_rank3_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 2>{});
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
index 898375567e2..d26b92b4f49 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f16_f16_rank3_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 3>{});
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
index 2ea196577cf..bbb735b6fe5 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 1>{});
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
index d373f918b81..92dbe677603 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 2>{});
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
index 07fabead64d..354cda85d75 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 3>{});
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
index 36de53bd7c6..edb5e42c103 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f16_f16_rank4_reduce4_instances(
     std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 4>{});
     add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
index 4ebcfb7075d..566be8fc22c 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f32_f32_rank3_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 1>{});
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
index 0c61d81c5b0..f9c76e3116c 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f32_f32_rank3_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 2>{});
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
index 7670ecf80db..541e0d71a93 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f32_f32_rank3_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 3>{});
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
index fcdc9627194..95a38df2834 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 1>{});
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
index c2faac889e9..a29b88891d4 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 2>{});
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
index ba11fb09084..0da46ea1b47 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 3>{});
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
index b608ac449ae..fa217dc3f5b 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_f32_f32_rank4_reduce4_instances(
     std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 4>{});
     add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
index fe578366101..1754d771faf 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_i8_i8_rank3_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 1>{});
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
index c3f6b2f823c..f77d66d6e63 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_i8_i8_rank3_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 2>{});
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
index 4b372626e5d..949d76ac68a 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_i8_i8_rank3_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 3>{});
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
index 876bb5af874..43c2979854e 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce1_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 1>{});
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
index 1539d8a55e7..08ff41a7565 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce2_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 2>{});
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 2>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
index 1d59752b59e..79850251636 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce3_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 3>{});
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
index aecdfe542e4..77b120c7390 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
@@ -16,6 +16,7 @@ namespace instance {
 void add_device_softmax_i8_i8_rank4_reduce4_instances(
     std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances)
 {
+    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 4>{});
     add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{});
 }
 

From 0fbff90826a7714c15d39c3ffb686b1ca69d588a Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Jun 2023 15:38:17 +0000
Subject: [PATCH 11/14] Add IsSupported checking for generic instance to client
 example of softmax

---
 client_example/06_softmax/softmax4d.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp
index 21c226c1ab7..2ccad27a887 100644
--- a/client_example/06_softmax/softmax4d.cpp
+++ b/client_example/06_softmax/softmax4d.cpp
@@ -64,6 +64,24 @@ int main(int argc, char* argv[])
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp>::GetInstances();
 
+    // Generic instance is expected at index 0; at() throws rather than UB when none is found.
+    const auto& generic_op_ptr = op_ptrs.at(0);
+    auto generic_argument_ptr = generic_op_ptr->MakeArgumentPointer(in_lengths,
+                                                                    in_strides,
+                                                                    reduce_dims,
+                                                                    alpha,
+                                                                    beta,
+                                                                    in.GetDeviceBuffer(),
+                                                                    out.GetDeviceBuffer(),
+                                                                    PassThrough{},
+                                                                    PassThrough{});
+
+    if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
+    {
+        throw std::runtime_error(
+            "The generic kernel instance should be able to support any input shapes");
+    }
+
     std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
 
     std::string best_op_name;

From 559b02bcb660d81490ed75b68d8e7a14fc55c387 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 7 Jun 2023 16:32:38 +0000
Subject: [PATCH 12/14] Replace
 get_device_normalize_from_mean_meansquare_instances() with the
 DeviceOperationInstanceFactory class for elementwise-normalization

---
 .../gemm_add_add_layernorm_naive.cpp          | 19 +++++----
 .../gpu/device_elementwise_instance.hpp       | 41 ++++++++++++-------
 2 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
index 1129dfa6b4d..58c91f903bc 100644
--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
@@ -172,18 +172,19 @@ int main()
             BLayout,
             CLayout>();
 
-    const auto normalize_ptrs =
-        ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances<
-            CDataType,
-            ReduceDataType,
-            ReduceDataType,
-            GammaDataType,
-            BetaDataType,
-            LayerNormOutDataType>();
-
     std::cout << "found " << gemm_reduce_ptrs.size()
               << " gemm_reduceMean_reduceSquareMean instances" << std::endl;
 
+    using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise<
+        ck::Tuple<CDataType, ReduceDataType, ReduceDataType, GammaDataType, BetaDataType>,
+        ck::Tuple<LayerNormOutDataType>,
+        ck::tensor_operation::element_wise::Normalize,
+        2>;
+
+    const auto normalize_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            NormalizeDeviceOp>::GetInstances();
+
     std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;
 
     auto f_matrix_space_size =
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
index 7e6267c87b4..b03693b00aa 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp
@@ -5,11 +5,10 @@
 
 #include <vector>
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -29,20 +28,34 @@ template <typename InputType,
           typename GammaDataType,
           typename BetaDataType,
           typename OutputType>
-auto get_device_normalize_from_mean_meansquare_instances()
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceElementwise<
+    ck::Tuple<InputType, MeanType, MeanSquareType, GammaDataType, BetaDataType>,
+    ck::Tuple<OutputType>,
+    Normalize,
+    2>>
 {
-    std::vector<DeviceNormalizeFromMeanMeanSquarePtr> op_ptrs;
+    using DeviceOp = DeviceElementwise<
+        ck::Tuple<InputType, MeanType, MeanSquareType, GammaDataType, BetaDataType>,
+        ck::Tuple<OutputType>,
+        Normalize,
+        2>;
 
-    if constexpr(is_same<InputType, half_t>::value && is_same<MeanType, float>::value &&
-                 is_same<MeanSquareType, float>::value && is_same<GammaDataType, half_t>::value &&
-                 is_same<BetaDataType, half_t>::value && is_same<OutputType, half_t>::value)
+    static auto GetInstances()
     {
-        ck::tensor_operation::device::instance::
-            add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs);
-    }
-
-    return op_ptrs;
-}
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(is_same<InputType, half_t>::value && is_same<MeanType, float>::value &&
+                     is_same<MeanSquareType, float>::value &&
+                     is_same<GammaDataType, half_t>::value &&
+                     is_same<BetaDataType, half_t>::value && is_same<OutputType, half_t>::value)
+        {
+            ck::tensor_operation::device::instance::
+                add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs);
+        }
+
+        return op_ptrs;
+    };
+};
 
 } // namespace instance
 } // namespace device

From 20f7c630f7c64d5051a80068d8be086b9d82f633 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 8 Jun 2023 08:47:14 +0000
Subject: [PATCH 13/14] clang-format fix

---
 .../gpu/normalization/device_groupnorm_f16_instance.cpp       | 3 ++-
 .../gpu/normalization/device_groupnorm_f32_instance.cpp       | 3 ++-
 .../device_groupnorm_swish_f16_f32_f32_f16_instance.cpp       | 4 ++--
 .../gpu/normalization/device_groupnorm_swish_f16_instance.cpp | 3 ++-
 .../gpu/normalization/device_groupnorm_swish_f32_instance.cpp | 3 ++-
 .../gpu/normalization/device_layernorm2d_f16_instance.cpp     | 3 ++-
 .../gpu/normalization/device_layernorm2d_f32_instance.cpp     | 3 ++-
 .../gpu/normalization/device_layernorm4d_f16_instance.cpp     | 3 ++-
 .../gpu/normalization/device_layernorm4d_f32_instance.cpp     | 3 ++-
 9 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
index 775fabaf081..e3820462cf8 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp
@@ -14,7 +14,8 @@ void add_device_normalization_rank_5_3_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>&
         instances)
 {
-    add_device_operation_instances(instances, device_normalization_f16_generic_instance<Pass, 5, 3>{});
+    add_device_operation_instances(instances,
+                                   device_normalization_f16_generic_instance<Pass, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
index a76f085ae4b..d85817aad31 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp
@@ -14,7 +14,8 @@ void add_device_normalization_rank_5_3_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>&
         instances)
 {
-    add_device_operation_instances(instances, device_normalization_f32_generic_instance<Pass, 5, 3>{});
+    add_device_operation_instances(instances,
+                                   device_normalization_f32_generic_instance<Pass, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
index 00b5101fdf3..a81f776c0f3 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
@@ -14,8 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F32, F16, Swish, 5, 3>>>&
         instances)
 {
-    add_device_operation_instances(instances,
-                                   device_normalization_f16_f32_f32_f16_generic_instance<Swish, 5, 3>{});
+    add_device_operation_instances(
+        instances, device_normalization_f16_f32_f32_f16_generic_instance<Swish, 5, 3>{});
     add_device_operation_instances(instances,
                                    device_normalization_f16_f32_f32_f16_instances<Swish, 5, 3>{});
 }
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
index 736f8b304ce..f4bb8bda814 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp
@@ -14,7 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Swish, 5, 3>>>&
         instances)
 {
-    add_device_operation_instances(instances, device_normalization_f16_generic_instance<Swish, 5, 3>{});
+    add_device_operation_instances(instances,
+                                   device_normalization_f16_generic_instance<Swish, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Swish, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
index 567305598e2..bbb9bd0fe8b 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp
@@ -14,7 +14,8 @@ void add_device_normalization_rank_5_3_swish_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>&
         instances)
 {
-    add_device_operation_instances(instances, device_normalization_f32_generic_instance<Swish, 5, 3>{});
+    add_device_operation_instances(instances,
+                                   device_normalization_f32_generic_instance<Swish, 5, 3>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Swish, 5, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
index ff2a13695fd..3f7e4aff1a2 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp
@@ -14,7 +14,8 @@ void add_device_normalization_rank_2_1_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>&
         instances)
 {
-    add_device_operation_instances(instances, device_normalization_f16_generic_instance<Pass, 2, 1>{});
+    add_device_operation_instances(instances,
+                                   device_normalization_f16_generic_instance<Pass, 2, 1>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
index 62a8fa87da7..1f0db3a0366 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp
@@ -14,7 +14,8 @@ void add_device_normalization_rank_2_1_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>&
         instances)
 {
-    add_device_operation_instances(instances, device_normalization_f32_generic_instance<Pass, 2, 1>{});
+    add_device_operation_instances(instances,
+                                   device_normalization_f32_generic_instance<Pass, 2, 1>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 2, 1>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
index 16d0f1f098e..cb9d72e6142 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp
@@ -14,7 +14,8 @@ void add_device_normalization_rank_4_3_f16_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>&
         instances)
 {
-    add_device_operation_instances(instances, device_normalization_f16_generic_instance<Pass, 4, 3>{});
+    add_device_operation_instances(instances,
+                                   device_normalization_f16_generic_instance<Pass, 4, 3>{});
     add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
index 049fcd7769c..ed555b840da 100644
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp
@@ -14,7 +14,8 @@ void add_device_normalization_rank_4_3_f32_instances(
     std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>&
         instances)
 {
-    add_device_operation_instances(instances, device_normalization_f32_generic_instance<Pass, 4, 3>{});
+    add_device_operation_instances(instances,
+                                   device_normalization_f32_generic_instance<Pass, 4, 3>{});
     add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 4, 3>{});
 }
 

From e0e2207dc7c97aec0f24290bdc98b8dfa74adc37 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 15 Jun 2023 10:00:43 +0000
Subject: [PATCH 14/14] Remove int8 from softmax instances

---
 .../tensor_operation_instance/gpu/softmax.hpp | 24 ----------
 ...e_softmax_i8_i8_instance_rank3_reduce1.hpp | 22 ---------
 ...e_softmax_i8_i8_instance_rank3_reduce2.hpp | 22 ---------
 ...e_softmax_i8_i8_instance_rank3_reduce3.hpp | 22 ---------
 ...e_softmax_i8_i8_instance_rank4_reduce1.hpp | 22 ---------
 ...e_softmax_i8_i8_instance_rank4_reduce2.hpp | 22 ---------
 ...e_softmax_i8_i8_instance_rank4_reduce3.hpp | 22 ---------
 ...e_softmax_i8_i8_instance_rank4_reduce4.hpp | 22 ---------
 .../device_softmax_i8_i8_instance_type.hpp    | 46 -------------------
 .../gpu/softmax/device_softmax_instance.hpp   |  7 ---
 .../gpu/softmax/CMakeLists.txt                |  7 ---
 ...e_softmax_i8_i8_instance_rank3_reduce1.cpp | 26 -----------
 ...e_softmax_i8_i8_instance_rank3_reduce2.cpp | 26 -----------
 ...e_softmax_i8_i8_instance_rank3_reduce3.cpp | 26 -----------
 ...e_softmax_i8_i8_instance_rank4_reduce1.cpp | 26 -----------
 ...e_softmax_i8_i8_instance_rank4_reduce2.cpp | 26 -----------
 ...e_softmax_i8_i8_instance_rank4_reduce3.cpp | 26 -----------
 ...e_softmax_i8_i8_instance_rank4_reduce4.cpp | 26 -----------
 test/softmax/test_softmax_rank3.cpp           |  4 +-
 test/softmax/test_softmax_rank4.cpp           |  4 +-
 20 files changed, 2 insertions(+), 426 deletions(-)
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
 delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
 delete mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp

diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
index 3f82b5bfd86..26815f1447c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
@@ -89,30 +89,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceSoftma
                     add_device_softmax_f32_f32_rank4_reduce4_instances(op_ptrs);
             }
         }
-        else if constexpr(std::is_same_v<InDataType, I8> && std::is_same_v<AccDataType, F32> &&
-                          std::is_same_v<OutDataType, I8>)
-        {
-            if constexpr(Rank == 3)
-            {
-                if constexpr(NumReduceDim == 1)
-                    add_device_softmax_i8_i8_rank3_reduce1_instances(op_ptrs);
-                else if constexpr(NumReduceDim == 2)
-                    add_device_softmax_i8_i8_rank3_reduce2_instances(op_ptrs);
-                else if constexpr(NumReduceDim == 3)
-                    add_device_softmax_i8_i8_rank3_reduce3_instances(op_ptrs);
-            }
-            else if constexpr(Rank == 4)
-            {
-                if constexpr(NumReduceDim == 1)
-                    add_device_softmax_i8_i8_rank4_reduce1_instances(op_ptrs);
-                else if constexpr(NumReduceDim == 2)
-                    add_device_softmax_i8_i8_rank4_reduce2_instances(op_ptrs);
-                else if constexpr(NumReduceDim == 3)
-                    add_device_softmax_i8_i8_rank4_reduce3_instances(op_ptrs);
-                else if constexpr(NumReduceDim == 4)
-                    add_device_softmax_i8_i8_rank4_reduce4_instances(op_ptrs);
-            }
-        }
 
         return op_ptrs;
     }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
deleted file mode 100644
index e047bf606ab..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
deleted file mode 100644
index 6945a535ee2..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
deleted file mode 100644
index 54ef4932e4f..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
deleted file mode 100644
index 577485f21da..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
deleted file mode 100644
index 3db80207e38..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
deleted file mode 100644
index d076beda3d9..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
deleted file mode 100644
index 19b913d859a..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
deleted file mode 100644
index aa4bf6be3b1..00000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <tuple>
-
-#include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
-#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-template <index_t Rank, index_t Reduce>
-using device_softmax_i8_i8_instances = std::tuple<
-    // clang-format off
-    //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,               16,              1,               1,              1>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  4,                 64,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,               32,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  2,                128,                1,               64,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  1,                256,                1,               16,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  1,                256,                1,               32,              1,              16,             16>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  1,                256,                1,               64,              1,              16,             16>,
-    // Reduction on middle dimensions
-    // InSrcVectorDim is 0 since we want to coalesce reads on M dimension
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                  8,                 32,                8,                8,              0,               1,              1>,
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,       256,                 32,                  8,               32,                8,              0,              16,              8>
-    // clang-format on
-    >;
-
-template <index_t Rank, index_t Reduce>
-using device_softmax_i8_i8_generic_instance = std::tuple<
-    // clang-format off
-    DeviceSoftmaxImpl<        I8,         F32,          I8,     PassThrough,      PassThrough, Rank,       Reduce,        64,                  8,                  8,                1,                1,              1,               1,              1>
-    // clang-format on
-    >;
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
index fbd0437a2bc..10f99acb8d6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
@@ -17,10 +17,3 @@
 #include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
 #include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
 #include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
diff --git a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
index 2a96a8570dc..202ad12b972 100644
--- a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
@@ -1,11 +1,4 @@
 add_instance_library(device_softmax_instance
-    device_softmax_i8_i8_instance_rank3_reduce1.cpp
-    device_softmax_i8_i8_instance_rank3_reduce2.cpp
-    device_softmax_i8_i8_instance_rank3_reduce3.cpp
-    device_softmax_i8_i8_instance_rank4_reduce1.cpp
-    device_softmax_i8_i8_instance_rank4_reduce2.cpp
-    device_softmax_i8_i8_instance_rank4_reduce3.cpp
-    device_softmax_i8_i8_instance_rank4_reduce4.cpp
     device_softmax_f16_f16_instance_rank3_reduce1.cpp
     device_softmax_f16_f16_instance_rank3_reduce2.cpp
     device_softmax_f16_f16_instance_rank3_reduce3.cpp
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
deleted file mode 100644
index 1754d771faf..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 1>{});
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
deleted file mode 100644
index f77d66d6e63..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 2>{});
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
deleted file mode 100644
index 949d76ac68a..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<3, 3>{});
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
deleted file mode 100644
index 43c2979854e..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 1>{});
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
deleted file mode 100644
index 08ff41a7565..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 2>{});
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 2>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
deleted file mode 100644
index 79850251636..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 3>{});
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
deleted file mode 100644
index 77b120c7390..00000000000
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_generic_instance<4, 4>{});
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/test/softmax/test_softmax_rank3.cpp b/test/softmax/test_softmax_rank3.cpp
index 24ad912d8d7..43ae11bf1f2 100644
--- a/test/softmax/test_softmax_rank3.cpp
+++ b/test/softmax/test_softmax_rank3.cpp
@@ -13,7 +13,6 @@ using I = ck::Number<N>;
 
 using F16 = ck::half_t;
 using F32 = float;
-using I8  = int8_t;
 
 template <typename Tuple>
 class TestSoftmax : public ck::TestSoftmax<Tuple>
@@ -24,8 +23,7 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
 using KernelTypes = ::testing::Types<
     //         InDataType, AccDataType, OutDataType, Rank
     std::tuple<       F16,         F32,         F16,    I<3>>,
-    std::tuple<       F32,         F32,         F32,    I<3>>,
-    std::tuple<        I8,         F32,          I8,    I<3>>
+    std::tuple<       F32,         F32,         F32,    I<3>>
     >;
 // clang-format on
 
diff --git a/test/softmax/test_softmax_rank4.cpp b/test/softmax/test_softmax_rank4.cpp
index b58301fb112..5cf96bbaa85 100644
--- a/test/softmax/test_softmax_rank4.cpp
+++ b/test/softmax/test_softmax_rank4.cpp
@@ -13,7 +13,6 @@ using I = ck::Number<N>;
 
 using F16 = ck::half_t;
 using F32 = float;
-using I8  = int8_t;
 
 template <typename Tuple>
 class TestSoftmax : public ck::TestSoftmax<Tuple>
@@ -24,8 +23,7 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
 using KernelTypes = ::testing::Types<
     //         InDataType, AccDataType, OutDataType, Rank
     std::tuple<       F16,         F32,         F16,    I<4>>,
-    std::tuple<       F32,         F32,         F32,    I<4>>,
-    std::tuple<        I8,         F32,          I8,    I<4>>
+    std::tuple<       F32,         F32,         F32,    I<4>>
     >;
 // clang-format on