Fused_mt Branch Migration (#64125)
* Merge fused_mt branch

* Adjust fuse_mt_int8

* Revert attention_layer_norm.h

* Revert paddle/phi/kernels/fusion/gpu/fmha_ref.h

* Add Windows support and refine formatting.

* Reformat for Windows.

* Remove redundant files; only flash_attn_v2 and variable-length input are now supported

* Refine static_fused_ft test

* Refine fused_mt related testcase

* Remove custom_all_reduce

* Remove operator cublaslt and revert parallel test

* Refine empty seq_len

* Refine ft

* Refine ft_static test

* Remove float32 support and static parallel ft test

* Fix static type error.

* Fix doc type error

* Fuse_mt code format

* Remove some redundant code

* Remove redundant attention_layer_norm.h

* Remove redundant code in ft_op

* Remove redundant code and skip fuse_mt doctest

* Remove redundant fmha_ref mmha_util and other code

* Remove redundant kernel

* Remove redundant file

* Refine fuse_mt code

* Refine cublaslt comment
penPenf28 committed May 23, 2024
1 parent 5ccfdff commit f8f9bfa
Showing 36 changed files with 3,695 additions and 3,098 deletions.
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
@@ -29,6 +29,8 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/resource_manager.h"
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/utils/string/printf.h"
 
 #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
@@ -45,6 +47,8 @@
 #include "paddle/pir/include/core/program.h"
 
 namespace paddle_infer {
+using float16 = paddle::platform::float16;
+using bfloat16 = phi::dtype::bfloat16;
 namespace experimental {
 class InternalUtils;
 };
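These aliases pull the 16-bit float types into the `paddle_infer` namespace so client code does not have to reach into `paddle::platform` or `phi::dtype`. A minimal client-side sketch, assuming an input named "x" and a `bfloat16` instantiation of `CopyFromCpu` (both are illustrative, not taken verbatim from this commit):

```cpp
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Hypothetical helper: feed float data to a bfloat16 model input using
// the new paddle_infer::bfloat16 alias.
void FeedBf16(paddle_infer::Predictor* predictor,
              const std::vector<float>& src) {
  auto input = predictor->GetInputHandle("x");  // assumed input name
  input->Reshape({1, static_cast<int>(src.size())});
  std::vector<paddle_infer::bfloat16> buf;
  buf.reserve(src.size());
  for (float v : src) buf.push_back(paddle_infer::bfloat16(v));  // narrowing
  input->CopyFromCpu(buf.data());
}
```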
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api.cc
@@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) {
   switch (dtype) {
     case PaddleDType::FLOAT32:
       return sizeof(float);
+    case PaddleDType::BFLOAT16:
+      return sizeof(uint16_t);
     case PaddleDType::INT64:
       return sizeof(int64_t);
     case PaddleDType::INT32:
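For reference, bfloat16 keeps float32's sign bit and 8 exponent bits and truncates the mantissa to 7 bits, so each value occupies exactly two bytes. A small sanity-check sketch (not part of the commit):

```cpp
#include <cstdint>
#include "paddle/phi/common/bfloat16.h"

// BFLOAT16 elements are two bytes wide, which is why PaddleDtypeSize
// can report sizeof(uint16_t) for them.
static_assert(sizeof(phi::dtype::bfloat16) == sizeof(uint16_t),
              "bfloat16 storage must match uint16_t");
```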
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api_impl.cc
@@ -221,6 +221,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
       input_ptr = input.mutable_data<float>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::INT32) {
       input_ptr = input.mutable_data<int32_t>(ddim, place_);
+    } else if (inputs[i].dtype == PaddleDType::BFLOAT16) {
+      input_ptr = input.mutable_data<bfloat16>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
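With this branch, the legacy `NativePaddlePredictor` feed path accepts BFLOAT16 tensors instead of falling through to the error branch. A hedged sketch of building such a feed, where the tensor name and shape are placeholders:

```cpp
#include <cstring>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/phi/common/bfloat16.h"

// Illustrative only: pack bfloat16 values into a PaddleTensor so that
// SetFeed dispatches to mutable_data<bfloat16>() as shown above.
paddle::PaddleTensor MakeBf16Feed(
    const std::vector<phi::dtype::bfloat16>& values) {
  paddle::PaddleTensor t;
  t.name = "x";  // placeholder input name
  t.shape = {1, static_cast<int>(values.size())};
  t.dtype = paddle::PaddleDType::BFLOAT16;
  t.data.Resize(values.size() * sizeof(phi::dtype::bfloat16));
  std::memcpy(t.data.data(), values.data(), t.data.length());
  return t;
}
```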
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -826,6 +826,7 @@ template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
 template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
 template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
 template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
+template void Tensor::ORTCopyToCpu<bfloat16>(bfloat16 *data) const;
 #endif
 
 namespace experimental {
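The new instantiation closes the device-to-host copy path for bfloat16 outputs. A usage sketch against the public tensor API, assuming the output name is known to the caller:

```cpp
#include <cstdint>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Hypothetical fetch: copy a bfloat16 output back to host memory.
std::vector<paddle_infer::bfloat16> FetchBf16(
    paddle_infer::Predictor* predictor, const std::string& out_name) {
  auto out = predictor->GetOutputHandle(out_name);
  int64_t numel = 1;
  for (int d : out->shape()) numel *= d;
  std::vector<paddle_infer::bfloat16> host(numel);
  out->CopyToCpu(host.data());  // routed to the new bfloat16 instantiation
  return host;
}
```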
7 changes: 4 additions & 3 deletions paddle/fluid/operators/fused/attn_gemm_int8.h
@@ -16,12 +16,12 @@ limitations under the License. */
 
 #include <iostream>
 #include <vector>
-#include "paddle/fluid/operators/fused/cublaslt.h"
 #include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/cublaslt.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 
 namespace paddle {
@@ -35,7 +35,8 @@ class AttnMatmulINT8 {
   AttnMatmulINT8(
       const phi::GPUContext& dev_ctx, int m, int n, int k, bool compute_bias)
       : dev_ctx_(dev_ctx), m_(m), n_(n), k_(k), compute_bias_(compute_bias) {
-    auto helper = std::make_shared<CublasLtHelper>(m, k, n);
+    auto helper = std::make_shared<phi::CublasLtHelper>(
+        m, k, n, dev_ctx.cublaslt_handle());
     helpers_.emplace_back(helper);
     gpu_config_ = std::make_unique<GpuLaunchConfig>(
         phi::backends::gpu::GetGpuLaunchConfig1D(
@@ -186,7 +187,7 @@
   int k_;  // k
 
   int compute_bias_;
-  std::vector<std::shared_ptr<CublasLtHelper>> helpers_;
+  std::vector<std::shared_ptr<phi::CublasLtHelper>> helpers_;
   std::unique_ptr<GpuLaunchConfig> gpu_config_;
 };
 
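The GEMM helper now lives in phi and takes the cuBLASLt handle from the device context instead of owning one per instance. A minimal sketch of the migrated construction, assuming only the four-argument signature visible in this diff:

```cpp
#include <memory>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/cublaslt.h"

// Sketch: build a helper for an (m x k) * (k x n) int8 matmul. Reusing
// dev_ctx.cublaslt_handle() lets every fused op share a single cublasLt
// handle rather than creating its own.
std::shared_ptr<phi::CublasLtHelper> MakeGemmHelper(
    const phi::GPUContext& dev_ctx, int m, int n, int k) {
  return std::make_shared<phi::CublasLtHelper>(
      m, k, n, dev_ctx.cublaslt_handle());
}
```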
