refine xpu inference api #54342

Merged: 12 commits, Jun 9, 2023
Changes from all commits
@@ -367,8 +367,10 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
with_time_step,
with_seq_lengths,
with_src_mask);
int quant_weight_bits =
Has("quant_weight_bits") ? Get<int>("quant_weight_bits") : -1;
int quant_post_dynamic_weight_precision =
Has("quant_post_dynamic_weight_precision ")
? Get<int>("quant_post_dynamic_weight_precision ")
: -1;

int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@@ -421,7 +423,7 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
w_node,
nullptr,
platform::errors::Fatal("w node should not be nullptr"));
if (quant_weight_bits == 8) {
if (quant_post_dynamic_weight_precision == 0) {
PrepareWeight<int8_t>(
graph, scope, block, w_node, &w_intx, &w_max, need_transpose);
} else {
61 changes: 54 additions & 7 deletions paddle/fluid/inference/analysis/argument.h
@@ -93,6 +93,25 @@ struct Argument {
private: \
type__ field__##_;

#define DECL_POINTER_ARGUMENT_FIELD(field__, Field, type__) \
public: \
type__& field__() { \
PADDLE_ENFORCE_EQ( \
Has(#field__), \
true, \
platform::errors::PreconditionNotMet("There is no such field")); \
return field__##_; \
} \
void Set##Field(type__ x) { \
field__##_ = x; \
valid_fields_.insert(#field__); \
} \
DECL_ARGUMENT_FIELD_VALID(field__); \
type__* field__##_ptr() { return &field__##_; } \
\
private: \
type__ field__##_;

#define DECL_ARGUMENT_FIELD_VALID(field__) \
bool field__##_valid() { return Has(#field__); }
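
The new DECL_POINTER_ARGUMENT_FIELD macro is easiest to read through an expansion. The self-contained sketch below mirrors the interface it generates for DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*), declared further down; the ArgumentSketch wrapper and the use of assert in place of PADDLE_ENFORCE_EQ are illustrative stand-ins, not Paddle code and not part of this PR.

// Illustrative stand-in for the Argument class: shows the accessors the
// macro generates for the xpu_l3_ptr field (getter returning a reference,
// SetXpuL3Ptr, xpu_l3_ptr_valid, and a raw-pointer accessor).
#include <cassert>
#include <string>
#include <unordered_set>

struct ArgumentSketch {
 public:
  bool Has(const std::string& name) const { return valid_fields_.count(name) > 0; }
  // type__& field__()            ->  void*& xpu_l3_ptr()
  void*& xpu_l3_ptr() {
    assert(Has("xpu_l3_ptr") && "There is no such field");
    return xpu_l3_ptr_;
  }
  // void Set##Field(type__ x)    ->  void SetXpuL3Ptr(void* x)
  void SetXpuL3Ptr(void* x) {
    xpu_l3_ptr_ = x;
    valid_fields_.insert("xpu_l3_ptr");
  }
  // DECL_ARGUMENT_FIELD_VALID    ->  bool xpu_l3_ptr_valid()
  bool xpu_l3_ptr_valid() const { return Has("xpu_l3_ptr"); }
  // type__* field__##_ptr()      ->  void** xpu_l3_ptr_ptr()
  void** xpu_l3_ptr_ptr() { return &xpu_l3_ptr_; }

 private:
  std::unordered_set<std::string> valid_fields_;
  void* xpu_l3_ptr_ = nullptr;
};

int main() {
  ArgumentSketch arg;
  int dummy = 0;
  arg.SetXpuL3Ptr(&dummy);             // marks the field as valid
  assert(arg.xpu_l3_ptr_valid());
  assert(arg.xpu_l3_ptr() == &dummy);  // getter now succeeds
  return 0;
}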

@@ -276,20 +295,48 @@ struct Argument {
DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);

DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool);
DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool);
DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string);
DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string);
DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool);
DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_bits,
                    XpuQuantPostDynamicWeightBits,
                    int);
// XpuConfig
DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t);
DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*);
DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t);
DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_file, XpuConvAutotuneFile, std::string);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_file_writeback,
XpuConvAutotuneFileWriteback,
bool);
DECL_ARGUMENT_FIELD(xpu_fc_autotune_level, XpuFcAutotuneLevel, int);
DECL_ARGUMENT_FIELD(xpu_fc_autotune_file, XpuFcAutotuneFile, std::string);
DECL_ARGUMENT_FIELD(xpu_fc_autotune_file_writeback,
XpuFcAutotuneFileWriteback,
bool);
DECL_ARGUMENT_FIELD(xpu_gemm_compute_precision, XpuGemmComputePrecision, int);
DECL_ARGUMENT_FIELD(xpu_transformer_softmax_optimize_level,
XpuTransformerSoftmaxOptimizeLevel,
int);
DECL_ARGUMENT_FIELD(xpu_transformer_encoder_adaptive_seqlen,
XpuTransformerEncoderAdaptiveSeqlen,
bool);
DECL_ARGUMENT_FIELD(xpu_quant_post_static_gelu_out_threshold,
XpuQuantPostStaticGeluOutThreshold,
float);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_activation_method,
XpuQuantPostDynamicActivationMethod,
int);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_precision,
XpuQuantPostDynamicWeightPrecision,
int);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_op_types,
XpuQuantPostDynamicOpTypes,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(xpu_lite_l3_locked, XpuLiteL3Locked, bool);
DECL_ARGUMENT_FIELD(xpu_lite_enable_multi_stream,
XpuLiteEnableMultiStream,
bool);

DECL_ARGUMENT_FIELD(use_opencl, UseOpenCL, bool);

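The block of xpu_* fields above is the Argument-side mirror of the refined user-facing XPU options. As a rough usage sketch, assuming the refined C++ API exposes a paddle_infer::XpuConfig struct whose members follow the same names (device_id, l3_size, conv_autotune_level, gemm_compute_precision, ...) and a Config::SetXpuConfig setter (names inferred from this diff and the PR title rather than shown in it):

// Hedged usage sketch; the XpuConfig member names and SetXpuConfig are
// assumed from the Argument fields above, and the model path is a placeholder.
#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./model_dir");                    // placeholder path
  config.EnableXpu();                                // turn on the XPU backend

  paddle_infer::XpuConfig xpu_config;                // assumed struct
  xpu_config.device_id = 0;
  xpu_config.l3_size = 16 * 1024 * 1024;             // L3 workspace in bytes
  xpu_config.conv_autotune_level = 1;
  xpu_config.conv_autotune_file = "conv_autotune.bin";
  xpu_config.gemm_compute_precision = 1;             // 0: int8, 1: int16, 2: int31
  xpu_config.transformer_encoder_adaptive_seqlen = true;
  config.SetXpuConfig(xpu_config);                   // assumed setter

  auto predictor = paddle_infer::CreatePredictor(config);
  (void)predictor;
  return 0;
}
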
53 changes: 36 additions & 17 deletions paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -267,20 +267,41 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("enable_int8", new bool(lite_enable_int8));
pass->Set("use_gpu", new bool(argument->use_gpu()));
pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
pass->Set("xpu_l3_workspace_size",
new int(argument->xpu_l3_workspace_size()));
pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
pass->Set("xpu_l3_size", new size_t(argument->xpu_l3_size()));
pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr()));
pass->Set("xpu_l3_autotune_size",
new size_t(argument->xpu_l3_autotune_size()));
pass->Set("xpu_stream", new void *(argument->xpu_stream()));
pass->Set("xpu_conv_autotune_level",
new int(argument->xpu_conv_autotune_level()));
pass->Set("xpu_conv_autotune_file",
new std::string(argument->xpu_conv_autotune_file()));
pass->Set("xpu_conv_autotune_file_writeback",
new bool(argument->xpu_conv_autotune_file_writeback()));
pass->Set("xpu_fc_autotune_level",
new int(argument->xpu_fc_autotune_level()));
pass->Set("xpu_fc_autotune_file",
new std::string(argument->xpu_fc_autotune_file()));
pass->Set("xpu_fc_autotune_file_writeback",
new bool(argument->xpu_fc_autotune_file_writeback()));
pass->Set("xpu_gemm_compute_precision",
new int(argument->xpu_gemm_compute_precision()));
pass->Set("xpu_transformer_softmax_optimize_level",
new int(argument->xpu_transformer_softmax_optimize_level()));
pass->Set("xpu_transformer_encoder_adaptive_seqlen",
new bool(argument->xpu_transformer_encoder_adaptive_seqlen()));
pass->Set(
"xpu_quant_post_static_gelu_out_threshold",
new float(argument->xpu_quant_post_static_gelu_out_threshold()));
pass->Set("xpu_quant_post_dynamic_activation_method",
new int(argument->xpu_quant_post_dynamic_activation_method()));
pass->Set("xpu_l3_locked", new bool(argument->xpu_lite_l3_locked()));
pass->Set("xpu_enable_multi_stream",
new bool(argument->xpu_lite_enable_multi_stream()));
pass->Set("use_opencl", new bool(argument->use_opencl()));
pass->Set("cpu_math_library_num_threads",
new int(argument->cpu_math_library_num_threads()));
pass->Set("locked", new bool(argument->xpu_locked()));
pass->Set("autotune", new bool(argument->xpu_autotune()));
pass->Set("autotune_file",
new std::string(argument->xpu_autotune_file()));
pass->Set("precision", new std::string(argument->xpu_precision()));
pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen()));
pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
pass->Set("enable_multi_stream",
new bool(argument->xpu_enable_multi_stream()));
// NNAdapter Related
pass->Set("use_nnadapter", new bool(argument->use_nnadapter()));
pass->Set("nnadapter_model_cache_dir",
@@ -313,12 +334,10 @@ void IRPassManager::CreatePasses(Argument *argument,
bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding();
pass->Set("use_fc_padding", new bool(use_fc_padding));
} else if (pass_name == "fused_multi_transformer_xpu_pass") {
auto op_types = argument->xpu_quant_post_dynamic_op_types();
if (std::count(op_types.begin(),
op_types.end(),
"fused_multi_transformer") > 0) {
pass->Set("quant_weight_bits",
new int(argument->xpu_quant_post_dynamic_weight_bits()));
int quant_post_dynamic_weight_precision =
argument->xpu_quant_post_dynamic_weight_precision();
if (quant_post_dynamic_weight_precision == 0) {
pass->Set("quant_post_dynamic_weight_precision ", new int(0));
}
}
pre_pass = pass_name;
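
Note that the string key passed to pass->Set here has to match the key the pass queries with Has(...)/Get<int>(...) in the fused_multi_transformer_xpu_pass hunk above; a mismatched key simply looks like an unset attribute and the pass falls back to its -1 default. A self-contained sketch of that handshake (AttrMap is a hypothetical stand-in for framework::ir::Pass):

// Producer/consumer pattern used above: ir_pass_manager.cc sets the
// attribute only when int8 weights are requested, the pass reads it back.
#include <cassert>
#include <map>
#include <string>

struct AttrMap {
  std::map<std::string, int> attrs;
  void Set(const std::string& key, int v) { attrs[key] = v; }
  bool Has(const std::string& key) const { return attrs.count(key) > 0; }
  int Get(const std::string& key) const { return attrs.at(key); }
};

int main() {
  AttrMap pass;
  // Producer side (mirrors ir_pass_manager.cc above).
  pass.Set("quant_post_dynamic_weight_precision", 0);
  // Consumer side (mirrors FusedMultiTransformerXPUQuant): -1 means "not set".
  int precision = pass.Has("quant_post_dynamic_weight_precision")
                      ? pass.Get("quant_post_dynamic_weight_precision")
                      : -1;
  assert(precision == 0);  // 0 selects PrepareWeight<int8_t> in the pass
  return 0;
}
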
43 changes: 27 additions & 16 deletions paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -249,17 +249,27 @@ void LiteSubgraphPass::SetUpEngine(

bool use_gpu = Get<bool>("use_gpu");
bool enable_int8 = Get<bool>("enable_int8");
bool use_xpu = Get<bool>("use_xpu");
int xpu_device_id = Get<int>("xpu_device_id");
int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
bool use_opencl = Get<bool>("use_opencl");
int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
bool locked = Get<bool>("locked");
bool autotune = Get<bool>("autotune");
std::string autotune_file = Get<std::string>("autotune_file");
std::string precision = Get<std::string>("precision");
bool adaptive_seqlen = Get<bool>("adaptive_seqlen");
bool enable_multi_stream = Get<bool>("enable_multi_stream");
bool use_xpu = Get<bool>("use_xpu");
int xpu_device_id = Get<int>("xpu_device_id");
size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
bool xpu_l3_locked = Get<bool>("xpu_l3_locked");
bool xpu_conv_autotune = Get<int>("xpu_conv_autotune_level") > 0;
std::string xpu_conv_autotune_file =
Get<std::string>("xpu_conv_autotune_file");
int xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
std::string xpu_transformer_encoder_precision{"int16"};
if (xpu_gemm_compute_precision == 0) {
xpu_transformer_encoder_precision = "int8";
} else if (xpu_gemm_compute_precision == 1) {
xpu_transformer_encoder_precision = "int16";
} else if (xpu_gemm_compute_precision == 2) {
xpu_transformer_encoder_precision = "int31";
}
bool xpu_transformer_encoder_adaptive_seqlen =
Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
bool xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
// NNAdapter Related
bool use_nnadapter = Get<bool>("use_nnadapter");
std::string nnadapter_model_cache_dir =
@@ -344,14 +354,15 @@ void LiteSubgraphPass::SetUpEngine(
}

config.cpu_math_library_num_threads = cpu_math_library_num_threads;
config.xpu_l3_workspace_size = xpu_l3_workspace_size;
config.xpu_l3_size = xpu_l3_size;
config.device_id = xpu_device_id;
config.locked = locked;
config.autotune = autotune;
config.autotune_file = autotune_file;
config.precision = precision;
config.adaptive_seqlen = adaptive_seqlen;
config.enable_multi_stream = enable_multi_stream;
config.xpu_l3_locked = xpu_l3_locked;
config.xpu_conv_autotune = xpu_conv_autotune;
config.xpu_conv_autotune_file = xpu_conv_autotune_file;
config.xpu_transformer_encoder_precision = xpu_transformer_encoder_precision;
config.xpu_transformer_encoder_adaptive_seqlen =
xpu_transformer_encoder_adaptive_seqlen;
config.xpu_enable_multi_stream = xpu_enable_multi_stream;
// NNAdapter Related
config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
config.nnadapter_device_names = nnadapter_device_names;
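
For quick reference, the xpu_gemm_compute_precision handling in SetUpEngine above reduces to a three-way mapping onto the Lite transformer encoder precision string, with int16 as the fallback for any other value. A sketch of that mapping as a standalone helper (the function name is illustrative, not part of the PR):

#include <string>

// Same mapping as in LiteSubgraphPass::SetUpEngine: 0 -> int8, 1 -> int16,
// 2 -> int31; anything else keeps the int16 default.
std::string XpuEncoderPrecision(int xpu_gemm_compute_precision) {
  switch (xpu_gemm_compute_precision) {
    case 0:
      return "int8";
    case 2:
      return "int31";
    case 1:
    default:
      return "int16";
  }
}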