Fused_mt Branch Migration (#64125)
* Merge fused_mt branch

* Adjust fuse_mt_int8

* Revert attention_layer_norm.h

* Revert paddle/phi/kernels/fusion/gpu/fmha_ref.h

* Add Windows support and refine formatting.

* Reformat for Windows.

* Remove redundant files; only flash_attn_v2 and variable-length input are now supported

* Refine static_fused_ft test

* Refine fused_mt related testcase

* Remove custom_all_reduce

* Remove operator cublaslt and revert parallel test

* Refine empty seq_len

* Refine ft

* Refine ft_static test

* Remove float32 support and static parallel ft test

* Fix static type error.

* Fix doc type error

* Fuse_mt code format

* Remove some redundant code

* Remove redundant attention_layer_norm.h

* Remove redundant code in ft_op

* Remove redundant code and skip fuse_mt doctest

* Remove redundant fmha_ref mmha_util and other code

* Remove redundant kernel

* Remove redundant file

* Refine fuse_mt code

* Refine cublaslt comment
penPenf28 committed May 23, 2024
1 parent 5ccfdff commit f8f9bfa
Showing 36 changed files with 3,695 additions and 3,098 deletions.
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
@@ -29,6 +29,8 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/resource_manager.h"
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/utils/string/printf.h"
 
 #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
@@ -45,6 +47,8 @@
 #include "paddle/pir/include/core/program.h"
 
 namespace paddle_infer {
+using float16 = paddle::platform::float16;
+using bfloat16 = phi::dtype::bfloat16;
 namespace experimental {
 class InternalUtils;
 };
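These aliases pull the 16-bit float types into the `paddle_infer` namespace so client code does not have to reach into `paddle::platform` or `phi::dtype`. A minimal client-side sketch, assuming an input named "x" and a `bfloat16` instantiation of `CopyFromCpu` (both are illustrative, not taken verbatim from this commit):

```cpp
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Hypothetical helper: feed float data to a bfloat16 model input using
// the new paddle_infer::bfloat16 alias.
void FeedBf16(paddle_infer::Predictor* predictor,
              const std::vector<float>& src) {
  auto input = predictor->GetInputHandle("x");  // assumed input name
  input->Reshape({1, static_cast<int>(src.size())});
  std::vector<paddle_infer::bfloat16> buf;
  buf.reserve(src.size());
  for (float v : src) buf.push_back(paddle_infer::bfloat16(v));  // narrowing
  input->CopyFromCpu(buf.data());
}
```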
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api.cc
@@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) {
   switch (dtype) {
     case PaddleDType::FLOAT32:
       return sizeof(float);
+    case PaddleDType::BFLOAT16:
+      return sizeof(uint16_t);
     case PaddleDType::INT64:
       return sizeof(int64_t);
     case PaddleDType::INT32:
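For reference, bfloat16 keeps float32's sign bit and 8 exponent bits and truncates the mantissa to 7 bits, so each value occupies exactly two bytes. A small sanity-check sketch (not part of the commit):

```cpp
#include <cstdint>
#include "paddle/phi/common/bfloat16.h"

// BFLOAT16 elements are two bytes wide, which is why PaddleDtypeSize
// can report sizeof(uint16_t) for them.
static_assert(sizeof(phi::dtype::bfloat16) == sizeof(uint16_t),
              "bfloat16 storage must match uint16_t");
```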
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api_impl.cc
@@ -221,6 +221,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
       input_ptr = input.mutable_data<float>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::INT32) {
       input_ptr = input.mutable_data<int32_t>(ddim, place_);
+    } else if (inputs[i].dtype == PaddleDType::BFLOAT16) {
+      input_ptr = input.mutable_data<bfloat16>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
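With this branch, the legacy `NativePaddlePredictor` feed path accepts BFLOAT16 tensors instead of falling through to the error branch. A hedged sketch of building such a feed, where the tensor name and shape are placeholders:

```cpp
#include <cstring>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/phi/common/bfloat16.h"

// Illustrative only: pack bfloat16 values into a PaddleTensor so that
// SetFeed dispatches to mutable_data<bfloat16>() as shown above.
paddle::PaddleTensor MakeBf16Feed(
    const std::vector<phi::dtype::bfloat16>& values) {
  paddle::PaddleTensor t;
  t.name = "x";  // placeholder input name
  t.shape = {1, static_cast<int>(values.size())};
  t.dtype = paddle::PaddleDType::BFLOAT16;
  t.data.Resize(values.size() * sizeof(phi::dtype::bfloat16));
  std::memcpy(t.data.data(), values.data(), t.data.length());
  return t;
}
```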
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -826,6 +826,7 @@ template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
 template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
 template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
 template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
+template void Tensor::ORTCopyToCpu<bfloat16>(bfloat16 *data) const;
 #endif
 
 namespace experimental {
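The new instantiation closes the device-to-host copy path for bfloat16 outputs. A usage sketch against the public tensor API, assuming the output name is known to the caller:

```cpp
#include <cstdint>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Hypothetical fetch: copy a bfloat16 output back to host memory.
std::vector<paddle_infer::bfloat16> FetchBf16(
    paddle_infer::Predictor* predictor, const std::string& out_name) {
  auto out = predictor->GetOutputHandle(out_name);
  int64_t numel = 1;
  for (int d : out->shape()) numel *= d;
  std::vector<paddle_infer::bfloat16> host(numel);
  out->CopyToCpu(host.data());  // routed to the new bfloat16 instantiation
  return host;
}
```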
7 changes: 4 additions & 3 deletions paddle/fluid/operators/fused/attn_gemm_int8.h
@@ -16,12 +16,12 @@ limitations under the License. */
 
 #include <iostream>
 #include <vector>
-#include "paddle/fluid/operators/fused/cublaslt.h"
 #include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/cublaslt.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 
 namespace paddle {
@@ -35,7 +35,8 @@ class AttnMatmulINT8 {
   AttnMatmulINT8(
       const phi::GPUContext& dev_ctx, int m, int n, int k, bool compute_bias)
       : dev_ctx_(dev_ctx), m_(m), n_(n), k_(k), compute_bias_(compute_bias) {
-    auto helper = std::make_shared<CublasLtHelper>(m, k, n);
+    auto helper = std::make_shared<phi::CublasLtHelper>(
+        m, k, n, dev_ctx.cublaslt_handle());
     helpers_.emplace_back(helper);
     gpu_config_ = std::make_unique<GpuLaunchConfig>(
         phi::backends::gpu::GetGpuLaunchConfig1D(
@@ -186,7 +187,7 @@
   int k_;  // k
 
   int compute_bias_;
-  std::vector<std::shared_ptr<CublasLtHelper>> helpers_;
+  std::vector<std::shared_ptr<phi::CublasLtHelper>> helpers_;
   std::unique_ptr<GpuLaunchConfig> gpu_config_;
 };
 
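The GEMM helper now lives in phi and takes the cuBLASLt handle from the device context instead of owning one per instance. A minimal sketch of the migrated construction, assuming only the four-argument signature visible in this diff:

```cpp
#include <memory>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/cublaslt.h"

// Sketch: build a helper for an (m x k) * (k x n) int8 matmul. Reusing
// dev_ctx.cublaslt_handle() lets every fused op share a single cublasLt
// handle rather than creating its own.
std::shared_ptr<phi::CublasLtHelper> MakeGemmHelper(
    const phi::GPUContext& dev_ctx, int m, int n, int k) {
  return std::make_shared<phi::CublasLtHelper>(
      m, k, n, dev_ctx.cublaslt_handle());
}
```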
