refine xpu inference api #54342

Merged: 12 commits, Jun 9, 2023
Changes from all commits
@@ -367,8 +367,10 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
with_time_step,
with_seq_lengths,
with_src_mask);
int quant_weight_bits =
Has("quant_weight_bits") ? Get<int>("quant_weight_bits") : -1;
int quant_post_dynamic_weight_precision =
Has("quant_post_dynamic_weight_precision ")
? Get<int>("quant_post_dynamic_weight_precision ")
: -1;

int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@@ -421,7 +423,7 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
w_node,
nullptr,
platform::errors::Fatal("w node should not be nullptr"));
if (quant_weight_bits == 8) {
if (quant_post_dynamic_weight_precision == 0) {
PrepareWeight<int8_t>(
graph, scope, block, w_node, &w_intx, &w_max, need_transpose);
} else {
61 changes: 54 additions & 7 deletions paddle/fluid/inference/analysis/argument.h
@@ -93,6 +93,25 @@ struct Argument {
private: \
type__ field__##_;

#define DECL_POINTER_ARGUMENT_FIELD(field__, Field, type__) \
public: \
type__& field__() { \
PADDLE_ENFORCE_EQ( \
Has(#field__), \
true, \
platform::errors::PreconditionNotMet("There is no such field")); \
return field__##_; \
} \
void Set##Field(type__ x) { \
field__##_ = x; \
valid_fields_.insert(#field__); \
} \
DECL_ARGUMENT_FIELD_VALID(field__); \
type__* field__##_ptr() { return &field__##_; } \
\
private: \
type__ field__##_;

#define DECL_ARGUMENT_FIELD_VALID(field__) \
bool field__##_valid() { return Has(#field__); }
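
The new DECL_POINTER_ARGUMENT_FIELD macro is easiest to read through an expansion. The self-contained sketch below mirrors the interface it generates for DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*), declared further down; the ArgumentSketch wrapper and the use of assert in place of PADDLE_ENFORCE_EQ are illustrative stand-ins, not Paddle code and not part of this PR.

// Illustrative stand-in for the Argument class: shows the accessors the
// macro generates for the xpu_l3_ptr field (getter returning a reference,
// SetXpuL3Ptr, xpu_l3_ptr_valid, and a raw-pointer accessor).
#include <cassert>
#include <string>
#include <unordered_set>

struct ArgumentSketch {
 public:
  bool Has(const std::string& name) const { return valid_fields_.count(name) > 0; }
  // type__& field__()            ->  void*& xpu_l3_ptr()
  void*& xpu_l3_ptr() {
    assert(Has("xpu_l3_ptr") && "There is no such field");
    return xpu_l3_ptr_;
  }
  // void Set##Field(type__ x)    ->  void SetXpuL3Ptr(void* x)
  void SetXpuL3Ptr(void* x) {
    xpu_l3_ptr_ = x;
    valid_fields_.insert("xpu_l3_ptr");
  }
  // DECL_ARGUMENT_FIELD_VALID    ->  bool xpu_l3_ptr_valid()
  bool xpu_l3_ptr_valid() const { return Has("xpu_l3_ptr"); }
  // type__* field__##_ptr()      ->  void** xpu_l3_ptr_ptr()
  void** xpu_l3_ptr_ptr() { return &xpu_l3_ptr_; }

 private:
  std::unordered_set<std::string> valid_fields_;
  void* xpu_l3_ptr_ = nullptr;
};

int main() {
  ArgumentSketch arg;
  int dummy = 0;
  arg.SetXpuL3Ptr(&dummy);             // marks the field as valid
  assert(arg.xpu_l3_ptr_valid());
  assert(arg.xpu_l3_ptr() == &dummy);  // getter now succeeds
  return 0;
}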

@@ -276,20 +295,48 @@ struct Argument {
DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);

DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool);
DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool);
DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string);
DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string);
DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool);
DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_bits,
                    XpuQuantPostDynamicWeightBits,
                    int);
// XpuConfig
DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t);
DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*);
DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t);
DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_file, XpuConvAutotuneFile, std::string);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_file_writeback,
XpuConvAutotuneFileWriteback,
bool);
DECL_ARGUMENT_FIELD(xpu_fc_autotune_level, XpuFcAutotuneLevel, int);
DECL_ARGUMENT_FIELD(xpu_fc_autotune_file, XpuFcAutotuneFile, std::string);
DECL_ARGUMENT_FIELD(xpu_fc_autotune_file_writeback,
XpuFcAutotuneFileWriteback,
bool);
DECL_ARGUMENT_FIELD(xpu_gemm_compute_precision, XpuGemmComputePrecision, int);
DECL_ARGUMENT_FIELD(xpu_transformer_softmax_optimize_level,
XpuTransformerSoftmaxOptimizeLevel,
int);
DECL_ARGUMENT_FIELD(xpu_transformer_encoder_adaptive_seqlen,
XpuTransformerEncoderAdaptiveSeqlen,
bool);
DECL_ARGUMENT_FIELD(xpu_quant_post_static_gelu_out_threshold,
XpuQuantPostStaticGeluOutThreshold,
float);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_activation_method,
XpuQuantPostDynamicActivationMethod,
int);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_precision,
XpuQuantPostDynamicWeightPrecision,
int);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_op_types,
XpuQuantPostDynamicOpTypes,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(xpu_lite_l3_locked, XpuLiteL3Locked, bool);
DECL_ARGUMENT_FIELD(xpu_lite_enable_multi_stream,
XpuLiteEnableMultiStream,
bool);

DECL_ARGUMENT_FIELD(use_opencl, UseOpenCL, bool);

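The block of xpu_* fields above is the Argument-side mirror of the refined user-facing XPU options. As a rough usage sketch, assuming the refined C++ API exposes a paddle_infer::XpuConfig struct whose members follow the same names (device_id, l3_size, conv_autotune_level, gemm_compute_precision, ...) and a Config::SetXpuConfig setter (names inferred from this diff and the PR title rather than shown in it):

// Hedged usage sketch; the XpuConfig member names and SetXpuConfig are
// assumed from the Argument fields above, and the model path is a placeholder.
#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./model_dir");                    // placeholder path
  config.EnableXpu();                                // turn on the XPU backend

  paddle_infer::XpuConfig xpu_config;                // assumed struct
  xpu_config.device_id = 0;
  xpu_config.l3_size = 16 * 1024 * 1024;             // L3 workspace in bytes
  xpu_config.conv_autotune_level = 1;
  xpu_config.conv_autotune_file = "conv_autotune.bin";
  xpu_config.gemm_compute_precision = 1;             // 0: int8, 1: int16, 2: int31
  xpu_config.transformer_encoder_adaptive_seqlen = true;
  config.SetXpuConfig(xpu_config);                   // assumed setter

  auto predictor = paddle_infer::CreatePredictor(config);
  (void)predictor;
  return 0;
}
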
53 changes: 36 additions & 17 deletions paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -267,20 +267,41 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("enable_int8", new bool(lite_enable_int8));
pass->Set("use_gpu", new bool(argument->use_gpu()));
pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
pass->Set("xpu_l3_workspace_size",
new int(argument->xpu_l3_workspace_size()));
pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
pass->Set("xpu_l3_size", new size_t(argument->xpu_l3_size()));
pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr()));
pass->Set("xpu_l3_autotune_size",
new size_t(argument->xpu_l3_autotune_size()));
pass->Set("xpu_stream", new void *(argument->xpu_stream()));
pass->Set("xpu_conv_autotune_level",
new int(argument->xpu_conv_autotune_level()));
pass->Set("xpu_conv_autotune_file",
new std::string(argument->xpu_conv_autotune_file()));
pass->Set("xpu_conv_autotune_file_writeback",
new bool(argument->xpu_conv_autotune_file_writeback()));
pass->Set("xpu_fc_autotune_level",
new int(argument->xpu_fc_autotune_level()));
pass->Set("xpu_fc_autotune_file",
new std::string(argument->xpu_fc_autotune_file()));
pass->Set("xpu_fc_autotune_file_writeback",
new bool(argument->xpu_fc_autotune_file_writeback()));
pass->Set("xpu_gemm_compute_precision",
new int(argument->xpu_gemm_compute_precision()));
pass->Set("xpu_transformer_softmax_optimize_level",
new int(argument->xpu_transformer_softmax_optimize_level()));
pass->Set("xpu_transformer_encoder_adaptive_seqlen",
new bool(argument->xpu_transformer_encoder_adaptive_seqlen()));
pass->Set(
"xpu_quant_post_static_gelu_out_threshold",
new float(argument->xpu_quant_post_static_gelu_out_threshold()));
pass->Set("xpu_quant_post_dynamic_activation_method",
new int(argument->xpu_quant_post_dynamic_activation_method()));
pass->Set("xpu_l3_locked", new bool(argument->xpu_lite_l3_locked()));
pass->Set("xpu_enable_multi_stream",
new bool(argument->xpu_lite_enable_multi_stream()));
pass->Set("use_opencl", new bool(argument->use_opencl()));
pass->Set("cpu_math_library_num_threads",
new int(argument->cpu_math_library_num_threads()));
pass->Set("locked", new bool(argument->xpu_locked()));
pass->Set("autotune", new bool(argument->xpu_autotune()));
pass->Set("autotune_file",
new std::string(argument->xpu_autotune_file()));
pass->Set("precision", new std::string(argument->xpu_precision()));
pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen()));
pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
pass->Set("enable_multi_stream",
new bool(argument->xpu_enable_multi_stream()));
// NNAdapter Related
pass->Set("use_nnadapter", new bool(argument->use_nnadapter()));
pass->Set("nnadapter_model_cache_dir",
@@ -313,12 +334,10 @@ void IRPassManager::CreatePasses(Argument *argument,
bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding();
pass->Set("use_fc_padding", new bool(use_fc_padding));
} else if (pass_name == "fused_multi_transformer_xpu_pass") {
auto op_types = argument->xpu_quant_post_dynamic_op_types();
if (std::count(op_types.begin(),
op_types.end(),
"fused_multi_transformer") > 0) {
pass->Set("quant_weight_bits",
new int(argument->xpu_quant_post_dynamic_weight_bits()));
int quant_post_dynamic_weight_precision =
argument->xpu_quant_post_dynamic_weight_precision();
if (quant_post_dynamic_weight_precision == 0) {
pass->Set("quant_post_dynamic_weight_precision ", new int(0));
}
}
pre_pass = pass_name;
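
Note that the string key passed to pass->Set here has to match the key the pass queries with Has(...)/Get<int>(...) in the fused_multi_transformer_xpu_pass hunk above; a mismatched key simply looks like an unset attribute and the pass falls back to its -1 default. A self-contained sketch of that handshake (AttrMap is a hypothetical stand-in for framework::ir::Pass):

// Producer/consumer pattern used above: ir_pass_manager.cc sets the
// attribute only when int8 weights are requested, the pass reads it back.
#include <cassert>
#include <map>
#include <string>

struct AttrMap {
  std::map<std::string, int> attrs;
  void Set(const std::string& key, int v) { attrs[key] = v; }
  bool Has(const std::string& key) const { return attrs.count(key) > 0; }
  int Get(const std::string& key) const { return attrs.at(key); }
};

int main() {
  AttrMap pass;
  // Producer side (mirrors ir_pass_manager.cc above).
  pass.Set("quant_post_dynamic_weight_precision", 0);
  // Consumer side (mirrors FusedMultiTransformerXPUQuant): -1 means "not set".
  int precision = pass.Has("quant_post_dynamic_weight_precision")
                      ? pass.Get("quant_post_dynamic_weight_precision")
                      : -1;
  assert(precision == 0);  // 0 selects PrepareWeight<int8_t> in the pass
  return 0;
}
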
43 changes: 27 additions & 16 deletions paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -249,17 +249,27 @@ void LiteSubgraphPass::SetUpEngine(

bool use_gpu = Get<bool>("use_gpu");
bool enable_int8 = Get<bool>("enable_int8");
bool use_xpu = Get<bool>("use_xpu");
int xpu_device_id = Get<int>("xpu_device_id");
int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
bool use_opencl = Get<bool>("use_opencl");
int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
bool locked = Get<bool>("locked");
bool autotune = Get<bool>("autotune");
std::string autotune_file = Get<std::string>("autotune_file");
std::string precision = Get<std::string>("precision");
bool adaptive_seqlen = Get<bool>("adaptive_seqlen");
bool enable_multi_stream = Get<bool>("enable_multi_stream");
bool use_xpu = Get<bool>("use_xpu");
int xpu_device_id = Get<int>("xpu_device_id");
size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
bool xpu_l3_locked = Get<bool>("xpu_l3_locked");
bool xpu_conv_autotune = Get<int>("xpu_conv_autotune_level") > 0;
std::string xpu_conv_autotune_file =
Get<std::string>("xpu_conv_autotune_file");
int xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
std::string xpu_transformer_encoder_precision{"int16"};
if (xpu_gemm_compute_precision == 0) {
xpu_transformer_encoder_precision = "int8";
} else if (xpu_gemm_compute_precision == 1) {
xpu_transformer_encoder_precision = "int16";
} else if (xpu_gemm_compute_precision == 2) {
xpu_transformer_encoder_precision = "int31";
}
bool xpu_transformer_encoder_adaptive_seqlen =
Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
bool xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
// NNAdapter Related
bool use_nnadapter = Get<bool>("use_nnadapter");
std::string nnadapter_model_cache_dir =
@@ -344,14 +354,15 @@ void LiteSubgraphPass::SetUpEngine(
}

config.cpu_math_library_num_threads = cpu_math_library_num_threads;
config.xpu_l3_workspace_size = xpu_l3_workspace_size;
config.xpu_l3_size = xpu_l3_size;
config.device_id = xpu_device_id;
config.locked = locked;
config.autotune = autotune;
config.autotune_file = autotune_file;
config.precision = precision;
config.adaptive_seqlen = adaptive_seqlen;
config.enable_multi_stream = enable_multi_stream;
config.xpu_l3_locked = xpu_l3_locked;
config.xpu_conv_autotune = xpu_conv_autotune;
config.xpu_conv_autotune_file = xpu_conv_autotune_file;
config.xpu_transformer_encoder_precision = xpu_transformer_encoder_precision;
config.xpu_transformer_encoder_adaptive_seqlen =
xpu_transformer_encoder_adaptive_seqlen;
config.xpu_enable_multi_stream = xpu_enable_multi_stream;
// NNAdapter Related
config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
config.nnadapter_device_names = nnadapter_device_names;
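
For quick reference, the xpu_gemm_compute_precision handling in SetUpEngine above reduces to a three-way mapping onto the Lite transformer encoder precision string, with int16 as the fallback for any other value. A sketch of that mapping as a standalone helper (the function name is illustrative, not part of the PR):

#include <string>

// Same mapping as in LiteSubgraphPass::SetUpEngine: 0 -> int8, 1 -> int16,
// 2 -> int31; anything else keeps the int16 default.
std::string XpuEncoderPrecision(int xpu_gemm_compute_precision) {
  switch (xpu_gemm_compute_precision) {
    case 0:
      return "int8";
    case 2:
      return "int31";
    case 1:
    default:
      return "int16";
  }
}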