Feature dynamic quant fc #1660

Merged · 2 commits · May 18, 2022
1 change: 1 addition & 0 deletions source/tnn/core/layer_type.cc
@@ -259,6 +259,7 @@ static std::map<std::string, LayerType> global_layer_type_map = {
{"DynamicRangeQuantizedConvolution", LAYER_CONVOLUTION},
{"DynamicRangeQuantizedLSTMONNX", LAYER_LSTMONNX},
{"DynamicRangeQuantizedMatMul", LAYER_MATMUL},
{"DynamicRangeQuantizedInnerProduct", LAYER_INNER_PRODUCT},
};

LayerType GlobalConvertLayerType(std::string layer_type_str) {
14 changes: 12 additions & 2 deletions source/tnn/device/arm/acc/compute_arm82/gemm_function_fp16.cc
@@ -506,12 +506,22 @@ void Kernel_1x16(int m, int n, int k, const fp16_t *sa, const fp16_t *sb, fp16_t
    Half8 vec_0 = Half8(fp16_t(0));
    Half8 c0 = vec_0;
    Half8 c1 = vec_0;
    // Kahan (compensated) summation: track the rounding error of each fp16
    // accumulation so it can be fed back in; compensation terms start at zero.
    Half8 error_0 = vec_0;
    Half8 error_1 = vec_0;
    for (int kk = 0; kk < k; ++kk) {
        Half8 b0 = Half8::load(b);
        Half8 b1 = Half8::load(b + 8);
        Half8 a0 = Half8(a[kk]);
        // The two deleted lines, replaced by the compensated form below:
        //     Half8::mla(c0, a0, b0);
        //     Half8::mla(c1, a0, b1);
        Half8 y_0 = a0 * b0 - error_0;  // product corrected by previously lost low-order bits
        Half8 y_1 = a0 * b1 - error_1;
        Half8 t_0 = c0 + y_0;
        Half8 t_1 = c1 + y_1;
        error_0 = (t_0 - c0) - y_0;     // capture what the addition rounded away
        error_1 = (t_1 - c1) - y_1;
        c0 = t_0;
        c1 = t_1;
        b += 16;
    }
    if (remain > 8) {
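The change above replaces the plain fused multiply-accumulate with Kahan (compensated) summation, so the rounding error of each fp16 accumulation is carried into the next step instead of being lost. A minimal scalar sketch of the same idea, written with plain float purely for illustration (KahanDot is not a function in this diff):

// Scalar illustration of Kahan summation as used in the fp16 kernel above:
// the compensation term captures the low-order bits lost by each addition.
float KahanDot(const float* a, const float* b, int k) {
    float sum   = 0.0f;
    float error = 0.0f;                  // running compensation, starts at zero
    for (int i = 0; i < k; ++i) {
        float y = a[i] * b[i] - error;   // apply the previously lost low-order bits
        float t = sum + y;               // accumulate into the high-order part
        error = (t - sum) - y;           // recover what the addition rounded away
        sum = t;
    }
    return sum;
}

The Half8 kernel applies this recurrence lane-wise to the two fp16 accumulators c0 and c1; with only ~10 mantissa bits in fp16, long reduction loops lose precision quickly under naive accumulation.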
@@ -108,7 +108,10 @@ Status InnerProductLayerInterpreter::SaveResource(Serializer& serializer, LayerP
        serializer.PutRaw(layer_res->zero_point_handle);
        serializer.PutRaw(layer_res->scale_handle);
    }

    if (layer_param->dynamic_range_quantized) {
        // dynamic range quantization currently uses symmetric quantization
        // (zero point is implicitly 0), so only the scale needs to be saved
        serializer.PutRaw(layer_res->scale_handle);
    }
    return TNN_OK;
}

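The new branch in SaveResource stores only the scale for dynamic-range-quantized weights. As background, an asymmetric scheme would have to serialize both a scale and a zero point to recover w ≈ scale * (q - zero_point), while the symmetric scheme used here fixes the zero point at 0, so w ≈ scale * q and the scale buffer alone is sufficient. A small illustrative sketch (these helpers are hypothetical and not part of the diff):

#include <cstdint>

// Symmetric dequantization: the zero point is implicitly 0, so only the scale
// has to be stored alongside the int8 weights.
inline float DequantSymmetric(int8_t q, float scale) {
    return scale * static_cast<float>(q);
}

// An asymmetric scheme would additionally have to store the zero point.
inline float DequantAsymmetric(int8_t q, float scale, int8_t zero_point) {
    return scale * static_cast<float>(q - zero_point);
}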
34 changes: 34 additions & 0 deletions source/tnn/optimizer/net_optimizer_dynamic_range_dequant.cc
@@ -60,6 +60,9 @@ namespace optimizer {
            case LAYER_MATMUL:
                DequantMatMul(layer, structure, resource);
                break;
            case LAYER_INNER_PRODUCT:
                DequantInnerProduct(layer, structure, resource);
                break;
            default:
                break;
        }
@@ -173,6 +176,37 @@ namespace optimizer {
    return TNN_OK;
}

Status NetOptimizerDynamicRangeDequant::DequantInnerProduct(std::shared_ptr<LayerInfo> &layer,
                                                            NetStructure *structure, NetResource *resource) {
    auto layer_name      = layer->name;
    auto matmul_resource = std::dynamic_pointer_cast<InnerProductLayerResource>(resource->resource_map[layer_name]);
    auto scale_handle    = matmul_resource->scale_handle;
    if (matmul_resource->weight_handle.GetDataType() != DATA_TYPE_INT8) {
        LOGD(
            "Dynamic range dequantize layer(%s) weight data type is not int8_t. "
            "This weight might have been dequantized before.\n",
            layer_name.c_str());
        return TNN_OK;
    }

    const int data_size = matmul_resource->weight_handle.GetDataCount();
    auto weight_ptr     = matmul_resource->weight_handle.force_to<int8_t *>();
    auto scale_value    = scale_handle.force_to<float *>()[0];
    std::vector<float> weight_data(data_size, 0);
    for (int i = 0; i < data_size; i++) {
        weight_data[i] = scale_value * (float)(weight_ptr[i]);
    }

    RawBuffer weight_buf(data_size * sizeof(float));
    memcpy(weight_buf.force_to<float *>(), weight_data.data(), data_size * sizeof(float));
    weight_buf.SetDataType(DATA_TYPE_FLOAT);
    weight_buf.SetBufferDims(matmul_resource->weight_handle.GetBufferDims());

    matmul_resource->weight_handle = weight_buf;
    layer->param->dynamic_range_quantized = false;
    return TNN_OK;
}

} // namespace optimizer

} // namespace TNN_NS
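As a concrete round trip through the quantize and dequantize passes: if the original float weights were {0.5, -1.0, 2.54}, a per-tensor scale of 2.54 / 127 = 0.02 would store them as int8 values {25, -50, 127}, and DequantInnerProduct above would restore 0.02 * {25, -50, 127} = {0.5, -1.0, 2.54}. (The scale = max|w| / 127 formula is an assumption about PerTensorQuant, whose body is not shown in this diff.)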
1 change: 1 addition & 0 deletions source/tnn/optimizer/net_optimizer_dynamic_range_dequant.h
@@ -38,6 +38,7 @@ namespace optimizer {
    Status DequantConv(std::shared_ptr<LayerInfo> &layer, NetStructure *structure, NetResource *resource);
    Status DequantLSTM(std::shared_ptr<LayerInfo> &layer, NetStructure *structure, NetResource *resource);
    Status DequantMatMul(std::shared_ptr<LayerInfo> &layer, NetStructure *structure, NetResource *resource);
    Status DequantInnerProduct(std::shared_ptr<LayerInfo> &layer, NetStructure *structure, NetResource *resource);
};

} // namespace optimizer
22 changes: 22 additions & 0 deletions tools/dynamic_range_quantization/dynamic_range_quantization.cc
@@ -45,6 +45,9 @@ Status DynamicRangeQuantizer::GetDynamicRangeQuantModel(std::shared_ptr<NetStruc
            case LAYER_MATMUL:
                QuantMatMul(layer, resource_map, constant_map);
                break;
            case LAYER_INNER_PRODUCT:
                QuantInnerProduct(layer, resource_map, constant_map);
                break;
            default:
                break;
        }
@@ -197,4 +200,23 @@ float DynamicRangeQuantizer::GetAbsMax(float* data, int data_size) {

    return max_value;
}

Status DynamicRangeQuantizer::QuantInnerProduct(std::shared_ptr<LayerInfo>& layer,
                                                std::map<std::string, std::shared_ptr<LayerResource>>& resource_map,
                                                std::map<std::string, std::shared_ptr<RawBuffer>>& constant_map) {
    auto matmul_param = std::dynamic_pointer_cast<InnerProductLayerParam>(layer->param);
    matmul_param->dynamic_range_quantized = true;
    std::shared_ptr<LayerResource> layer_resource = nullptr;
    if (resource_map.find(layer->name) != resource_map.end()) {
        layer_resource = resource_map[layer->name];
    }
    RawBuffer quant_buf;
    RawBuffer scale_buf;
    auto matmul_resource = std::dynamic_pointer_cast<InnerProductLayerResource>(layer_resource);
    PerTensorQuant(matmul_resource->weight_handle, quant_buf, scale_buf);
    matmul_resource->weight_handle = quant_buf;
    matmul_resource->scale_handle = scale_buf;
    return TNN_OK;
}

} // namespace TNN_NS
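QuantInnerProduct above delegates the actual weight conversion to the existing PerTensorQuant helper, whose implementation is not shown in this diff. Under the symmetric scheme described earlier, a plausible per-tensor quantizer computes scale = max|w| / 127 and q = clamp(round(w / scale), -127, 127). The following is an assumption-based sketch using plain vectors rather than RawBuffer, not the actual PerTensorQuant implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Assumed sketch of symmetric per-tensor quantization; the real PerTensorQuant
// operates on RawBuffer and is not part of this diff.
void PerTensorQuantSketch(const std::vector<float>& weights,
                          std::vector<int8_t>& quantized, float& scale) {
    float abs_max = 0.0f;
    for (float w : weights) {
        abs_max = std::max(abs_max, std::fabs(w));      // same role as GetAbsMax above
    }
    scale = (abs_max > 0.0f) ? (abs_max / 127.0f) : 1.0f;  // map [-max, max] onto [-127, 127]
    quantized.resize(weights.size());
    for (size_t i = 0; i < weights.size(); ++i) {
        float q = std::round(weights[i] / scale);
        q = std::min(127.0f, std::max(-127.0f, q));     // clamp to the symmetric int8 range
        quantized[i] = static_cast<int8_t>(q);
    }
}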
3 changes: 3 additions & 0 deletions tools/dynamic_range_quantization/dynamic_range_quantization.h
@@ -42,6 +42,9 @@ class DynamicRangeQuantizer {
    Status QuantMatMul(std::shared_ptr<LayerInfo>& layer,
                       std::map<std::string, std::shared_ptr<LayerResource>>& resource_map,
                       std::map<std::string, std::shared_ptr<RawBuffer>>& constant_map);
    Status QuantInnerProduct(std::shared_ptr<LayerInfo>& layer,
                             std::map<std::string, std::shared_ptr<LayerResource>>& resource_map,
                             std::map<std::string, std::shared_ptr<RawBuffer>>& constant_map);
    Status PerChannelQuant(RawBuffer& weight_buf, RawBuffer& quant_buf, RawBuffer& scale_buf, int num_kernel);
    Status PerTensorQuant(RawBuffer& weight_buf, RawBuffer& quant_buf, RawBuffer& scale_buf);
