
Commit

core int8 inference, quantize and dequantize, net using flag, caffe2ncnn reads int8 scale table

nihui committed Jul 29, 2018
1 parent b6b90c8 commit a169cec
Showing 26 changed files with 1,265 additions and 58 deletions.
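
The scheme introduced here is plain symmetric linear quantization: activations and weights are multiplied by a calibration scale and rounded to the nearest int8 value, the convolution accumulates in int32, and the accumulator is rescaled back to float32 by the inverse product of the two scales (see src/layer/convolution.cpp below). A minimal sketch of that arithmetic, assuming round-to-nearest symmetric quantization as the diff suggests; the helper names are illustrative and not part of the ncnn API:

    #include <cmath>
    #include <cstdint>

    // Sketch only: scale-and-round quantization as performed by the new Quantize layer.
    // The exact clamp bounds inside ncnn may differ slightly.
    static inline int8_t quantize_value(float v, float scale)
    {
        int q = (int)std::round(v * scale);
        if (q > 127) q = 127;
        if (q < -127) q = -127;
        return (int8_t)q;
    }

    // Sketch only: rescale the int32 accumulator back to float32, as the new Dequantize
    // layer does with top_rescale = 1 / (bottom_scale * weight_data_int8_scale).
    static inline float dequantize_sum(int sum, float bottom_scale, float weight_scale, float bias)
    {
        return sum * (1.f / (bottom_scale * weight_scale)) + bias;
    }
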
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
@@ -148,6 +148,8 @@ ncnn_add_layer(InstanceNorm)
ncnn_add_layer(Clip)
ncnn_add_layer(Reorg)
ncnn_add_layer(YoloDetectionOutput)
ncnn_add_layer(Quantize)
ncnn_add_layer(Dequantize)

add_library(ncnn STATIC ${ncnn_SRCS})

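
Registering Quantize and Dequantize here is what lets convolution.cpp below instantiate them at run time through the generic layer factory. The sketch that follows drives the new Quantize layer standalone, mirroring the create_layer / load_param / forward sequence used in convolution.cpp; the include paths, scale value and blob size are assumptions for illustration:

    #include "layer.h"       // ncnn::Layer, ncnn::create_layer
    #include "layer_type.h"  // ncnn::LayerType::Quantize
    #include "paramdict.h"   // ncnn::ParamDict

    void quantize_blob_example(const ncnn::Mat& bottom /* float32 in */, ncnn::Mat& top /* int8 out */)
    {
        ncnn::Layer* quantize = ncnn::create_layer(ncnn::LayerType::Quantize);

        ncnn::ParamDict pd;
        pd.set(0, 127.f / 2.5f);   // param 0 = scale, e.g. 127 / max|activation| from a calibration run
        quantize->load_param(pd);

        // top receives int8 data (elemsize 1), like bottom_blob_bordered_int8 in convolution.cpp below
        quantize->forward(bottom, top);

        delete quantize;
    }
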
1 change: 1 addition & 0 deletions src/blob.cpp
@@ -19,6 +19,7 @@ namespace ncnn {
Blob::Blob()
{
producer = -1;
int8_scale = 0.f;
}

} // namespace ncnn
4 changes: 4 additions & 0 deletions src/blob.h
@@ -36,6 +36,10 @@ class Blob
int producer;
// layer index which need this blob as input
std::vector<int> consumers;

public:
// int8 quantize scale of this blob
float int8_scale;
};

} // namespace ncnn
3 changes: 3 additions & 0 deletions src/layer.h
@@ -36,6 +36,9 @@ class Option
int num_threads;
Allocator* blob_allocator;
Allocator* workspace_allocator;

public:
std::vector<float> int8_scales;
};

const Option& get_default_option();
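
Option now carries a vector of per-bottom-blob scales, and Blob (above) stores the scale read from the calibration table. The net-side plumbing that connects the two lives in net.cpp, which is part of this commit but not shown in this excerpt; the sketch below is therefore an assumption of how the net would fill opt.int8_scales before calling a layer, matching the opt.int8_scales[0] read in convolution.cpp below:

    // Assumed wiring (net.cpp not shown in this excerpt): copy each bottom blob's
    // calibration scale into the per-call Option so the layer can quantize its inputs.
    ncnn::Option opt_int8 = opt;
    opt_int8.int8_scales.resize(layer->bottoms.size());
    for (size_t i = 0; i < layer->bottoms.size(); i++)
    {
        opt_int8.int8_scales[i] = blobs[layer->bottoms[i]].int8_scale;
    }
    int ret = layer->forward(bottom_blob, top_blob, opt_int8);
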
6 changes: 6 additions & 0 deletions src/layer/arm/convolution_arm.cpp
@@ -194,6 +194,12 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
// convolve with NxN kernel
// value = value + bias

if (use_int8_inference)
{
// TODO
return Convolution::forward(bottom_blob, top_blob, opt);
}

if (bottom_blob.dims != 3)
{
return Convolution::forward(bottom_blob, top_blob, opt);
6 changes: 6 additions & 0 deletions src/layer/arm/convolutiondepthwise_arm.cpp
@@ -107,6 +107,12 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
// convolve with NxN kernel
// value = value + bias

if (use_int8_inference)
{
// TODO
return ConvolutionDepthWise::forward(bottom_blob, top_blob, opt);
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
6 changes: 6 additions & 0 deletions src/layer/arm/innerproduct_arm.cpp
@@ -24,6 +24,12 @@ DEFINE_LAYER_CREATOR(InnerProduct_arm)

int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
if (use_int8_inference)
{
// TODO
return InnerProduct::forward(bottom_blob, top_blob, opt);
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
134 changes: 134 additions & 0 deletions src/layer/convolution.cpp
@@ -24,6 +24,15 @@ Convolution::Convolution()
{
one_blob_only = true;
support_inplace = false;

quantize = 0;
dequantize = 0;
}

Convolution::~Convolution()
{
delete quantize;
delete dequantize;
}

int Convolution::load_param(const ParamDict& pd)
@@ -39,6 +48,9 @@ int Convolution::load_param(const ParamDict& pd)
pad_h = pd.get(14, pad_w);
bias_term = pd.get(5, 0);
weight_data_size = pd.get(6, 0);
weight_data_int8_scale = pd.get(8, 0.f);

use_int8_inference = pd.use_int8_inference;

return 0;
}
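
With param id 8 now holding the weight quantization scale alongside the existing 5=bias_term and 6=weight_data_size, a quantized convolution can carry its scale directly in the .param file. A hypothetical layer line, with made-up values, just to show where 8= sits:

    Convolution  conv1  1 1  data conv1  0=64 1=3 5=1 6=1728 8=120.5

Here 0=64 outputs with 3x3 kernels over a 3-channel input would give 6=1728 weight values, and 8=120.5 would be the weight_data_int8_scale read above; the use_int8_inference switch itself reaches the layer through pd.use_int8_inference rather than a numbered id, presumably set from the net-level flag mentioned in the commit message.
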
@@ -56,6 +68,46 @@ int Convolution::load_model(const ModelBin& mb)
return -100;
}

bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);

if (weight_data_is_int8 && !use_int8_inference)
{
fprintf(stderr, "quantized int8 weight loaded but use_int8_inference disabled\n");
return -1;
}

if (use_int8_inference)
{
quantize = ncnn::create_layer(ncnn::LayerType::Quantize);
dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize);
}

if (weight_data_is_float32 && use_int8_inference)
{
if (weight_data_int8_scale != 0.f)
{
// quantize weight to int8
ncnn::ParamDict pd;
pd.set(0, weight_data_int8_scale);// scale

quantize->load_param(pd);

Mat int8_weight_data;
quantize->forward(weight_data, int8_weight_data);

if (int8_weight_data.empty())
return -100;

weight_data = int8_weight_data;
}
else
{
// plain float32 weight, fallback to float32 inference
use_int8_inference = false;
}
}

return 0;
}

@@ -78,6 +130,9 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
pd.set(0, num_output);
pd.set(1, bias_term);
pd.set(2, weight_data_size);
pd.set(8, weight_data_int8_scale);

pd.use_int8_inference = use_int8_inference;

op->load_param(pd);

@@ -160,6 +215,85 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
}
}

if (use_int8_inference)
{
Mat bottom_blob_bordered_int8;
bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
if (bottom_blob_bordered_int8.empty())
return -100;

float bottom_scale = opt.int8_scales[0];
// fprintf(stderr, "bottom_scale = %f\n", bottom_scale);

// quantize, scale and round to nearest
{
ncnn::ParamDict pd;
pd.set(0, bottom_scale);// scale

quantize->load_param(pd);

quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt);
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
int* outptr = top_blob.channel(p);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
int sum = 0;

const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

// channels
for (int q=0; q<channels; q++)
{
const Mat m = bottom_blob_bordered_int8.channel(q);
const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;

for (int k = 0; k < maxk; k++)
{
int val = sptr[ space_ofs[k] ];
int w = kptr[k];
sum += val * w;
}

kptr += maxk;
}

outptr[j] = sum;
}

outptr += outw;
}
}

// dequantize, reverse scale inplace
{
float top_rescale = 1.f / (bottom_scale * weight_data_int8_scale);

ncnn::ParamDict pd;
pd.set(0, top_rescale);// scale
pd.set(1, bias_term);// bias_term
pd.set(2, num_output);// bias_data_size

dequantize->load_param(pd);

ncnn::Mat weights[1];
weights[0] = bias_data;

dequantize->load_model(ModelBinFromMatArray(weights));

dequantize->forward_inplace(top_blob, opt);
}

return 0;
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
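
Why dividing by bottom_scale * weight_data_int8_scale recovers the float result: the quantized values satisfy q_x ~ x * bottom_scale and q_w ~ w * weight_scale, so the int32 accumulator is approximately (sum of x * w) * bottom_scale * weight_scale, and multiplying by top_rescale = 1 / (bottom_scale * weight_scale) undoes both scalings up to rounding error; the Dequantize layer then adds the float32 bias. A toy check of that factor, purely illustrative and not ncnn code:

    #include <cmath>   // std::round

    // Illustrative only: one multiply-accumulate checked against its float value.
    float int8_mac_example()
    {
        float x = 0.8f, w = -0.05f;            // one activation and one weight
        float bottom_scale = 127.f / 1.0f;     // assumed calibration: activations in [-1, 1]
        float weight_scale = 127.f / 0.1f;     // assumed calibration: weights in [-0.1, 0.1]

        int qx = (int)std::round(x * bottom_scale);   // 102
        int qw = (int)std::round(w * weight_scale);   // -64
        int sum = qx * qw;                            // -6528, the int32 accumulator

        return sum * (1.f / (bottom_scale * weight_scale));  // ~ -0.0405 vs. exact x*w = -0.04
    }
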
7 changes: 7 additions & 0 deletions src/layer/convolution.h
@@ -23,6 +23,7 @@ class Convolution : public Layer
{
public:
Convolution();
~Convolution();

virtual int load_param(const ParamDict& pd);

@@ -44,10 +45,16 @@ class Convolution : public Layer
int bias_term;

int weight_data_size;
float weight_data_int8_scale;

// model
Mat weight_data;
Mat bias_data;

bool use_int8_inference;

ncnn::Layer* quantize;
ncnn::Layer* dequantize;
};

} // namespace ncnn
