
Commit

core int8 inference, quantize and dequantize, net using flag, caffe2ncnn reads int8 scale table

nihui committed Jul 29, 2018
1 parent b6b90c8 commit a169cec
Showing 26 changed files with 1,265 additions and 58 deletions.
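
The scheme introduced here is plain symmetric linear quantization: activations and weights are multiplied by a calibration scale and rounded to the nearest int8 value, the convolution accumulates in int32, and the accumulator is rescaled back to float32 by the inverse product of the two scales (see src/layer/convolution.cpp below). A minimal sketch of that arithmetic, assuming round-to-nearest symmetric quantization as the diff suggests; the helper names are illustrative and not part of the ncnn API:

    #include <cmath>
    #include <cstdint>

    // Sketch only: scale-and-round quantization as performed by the new Quantize layer.
    // The exact clamp bounds inside ncnn may differ slightly.
    static inline int8_t quantize_value(float v, float scale)
    {
        int q = (int)std::round(v * scale);
        if (q > 127) q = 127;
        if (q < -127) q = -127;
        return (int8_t)q;
    }

    // Sketch only: rescale the int32 accumulator back to float32, as the new Dequantize
    // layer does with top_rescale = 1 / (bottom_scale * weight_data_int8_scale).
    static inline float dequantize_sum(int sum, float bottom_scale, float weight_scale, float bias)
    {
        return sum * (1.f / (bottom_scale * weight_scale)) + bias;
    }
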
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
@@ -148,6 +148,8 @@ ncnn_add_layer(InstanceNorm)
ncnn_add_layer(Clip)
ncnn_add_layer(Reorg)
ncnn_add_layer(YoloDetectionOutput)
ncnn_add_layer(Quantize)
ncnn_add_layer(Dequantize)

add_library(ncnn STATIC ${ncnn_SRCS})

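
Registering Quantize and Dequantize here is what lets convolution.cpp below instantiate them at run time through the generic layer factory. The sketch that follows drives the new Quantize layer standalone, mirroring the create_layer / load_param / forward sequence used in convolution.cpp; the include paths, scale value and blob size are assumptions for illustration:

    #include "layer.h"       // ncnn::Layer, ncnn::create_layer
    #include "layer_type.h"  // ncnn::LayerType::Quantize
    #include "paramdict.h"   // ncnn::ParamDict

    void quantize_blob_example(const ncnn::Mat& bottom /* float32 in */, ncnn::Mat& top /* int8 out */)
    {
        ncnn::Layer* quantize = ncnn::create_layer(ncnn::LayerType::Quantize);

        ncnn::ParamDict pd;
        pd.set(0, 127.f / 2.5f);   // param 0 = scale, e.g. 127 / max|activation| from a calibration run
        quantize->load_param(pd);

        // top receives int8 data (elemsize 1), like bottom_blob_bordered_int8 in convolution.cpp below
        quantize->forward(bottom, top);

        delete quantize;
    }
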
1 change: 1 addition & 0 deletions src/blob.cpp
@@ -19,6 +19,7 @@ namespace ncnn {
Blob::Blob()
{
producer = -1;
int8_scale = 0.f;
}

} // namespace ncnn
4 changes: 4 additions & 0 deletions src/blob.h
@@ -36,6 +36,10 @@ class Blob
int producer;
// layer index which need this blob as input
std::vector<int> consumers;

public:
// int8 quantize scale of this blob
float int8_scale;
};

} // namespace ncnn
3 changes: 3 additions & 0 deletions src/layer.h
@@ -36,6 +36,9 @@ class Option
int num_threads;
Allocator* blob_allocator;
Allocator* workspace_allocator;

public:
std::vector<float> int8_scales;
};

const Option& get_default_option();
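
Option now carries a vector of per-bottom-blob scales, and Blob (above) stores the scale read from the calibration table. The net-side plumbing that connects the two lives in net.cpp, which is part of this commit but not shown in this excerpt; the sketch below is therefore an assumption of how the net would fill opt.int8_scales before calling a layer, matching the opt.int8_scales[0] read in convolution.cpp below:

    // Assumed wiring (net.cpp not shown in this excerpt): copy each bottom blob's
    // calibration scale into the per-call Option so the layer can quantize its inputs.
    ncnn::Option opt_int8 = opt;
    opt_int8.int8_scales.resize(layer->bottoms.size());
    for (size_t i = 0; i < layer->bottoms.size(); i++)
    {
        opt_int8.int8_scales[i] = blobs[layer->bottoms[i]].int8_scale;
    }
    int ret = layer->forward(bottom_blob, top_blob, opt_int8);
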
6 changes: 6 additions & 0 deletions src/layer/arm/convolution_arm.cpp
@@ -194,6 +194,12 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
// convolve with NxN kernel
// value = value + bias

if (use_int8_inference)
{
// TODO
return Convolution::forward(bottom_blob, top_blob, opt);
}

if (bottom_blob.dims != 3)
{
return Convolution::forward(bottom_blob, top_blob, opt);
6 changes: 6 additions & 0 deletions src/layer/arm/convolutiondepthwise_arm.cpp
@@ -107,6 +107,12 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
// convolve with NxN kernel
// value = value + bias

if (use_int8_inference)
{
// TODO
return ConvolutionDepthWise::forward(bottom_blob, top_blob, opt);
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
6 changes: 6 additions & 0 deletions src/layer/arm/innerproduct_arm.cpp
@@ -24,6 +24,12 @@ DEFINE_LAYER_CREATOR(InnerProduct_arm)

int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
if (use_int8_inference)
{
// TODO
return InnerProduct::forward(bottom_blob, top_blob, opt);
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
134 changes: 134 additions & 0 deletions src/layer/convolution.cpp
@@ -24,6 +24,15 @@ Convolution::Convolution()
{
one_blob_only = true;
support_inplace = false;

quantize = 0;
dequantize = 0;
}

Convolution::~Convolution()
{
delete quantize;
delete dequantize;
}

int Convolution::load_param(const ParamDict& pd)
@@ -39,6 +48,9 @@ int Convolution::load_param(const ParamDict& pd)
pad_h = pd.get(14, pad_w);
bias_term = pd.get(5, 0);
weight_data_size = pd.get(6, 0);
weight_data_int8_scale = pd.get(8, 0.f);

use_int8_inference = pd.use_int8_inference;

return 0;
}
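
With param id 8 now holding the weight quantization scale alongside the existing 5=bias_term and 6=weight_data_size, a quantized convolution can carry its scale directly in the .param file. A hypothetical layer line, with made-up values, just to show where 8= sits:

    Convolution  conv1  1 1  data conv1  0=64 1=3 5=1 6=1728 8=120.5

Here 0=64 outputs with 3x3 kernels over a 3-channel input would give 6=1728 weight values, and 8=120.5 would be the weight_data_int8_scale read above; the use_int8_inference switch itself reaches the layer through pd.use_int8_inference rather than a numbered id, presumably set from the net-level flag mentioned in the commit message.
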
@@ -56,6 +68,46 @@ int Convolution::load_model(const ModelBin& mb)
return -100;
}

bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);

if (weight_data_is_int8 && !use_int8_inference)
{
fprintf(stderr, "quantized int8 weight loaded but use_int8_inference disabled\n");
return -1;
}

if (use_int8_inference)
{
quantize = ncnn::create_layer(ncnn::LayerType::Quantize);
dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize);
}

if (weight_data_is_float32 && use_int8_inference)
{
if (weight_data_int8_scale != 0.f)
{
// quantize weight to int8
ncnn::ParamDict pd;
pd.set(0, weight_data_int8_scale);// scale

quantize->load_param(pd);

Mat int8_weight_data;
quantize->forward(weight_data, int8_weight_data);

if (int8_weight_data.empty())
return -100;

weight_data = int8_weight_data;
}
else
{
// plain float32 weight, fallback to float32 inference
use_int8_inference = false;
}
}

return 0;
}

@@ -78,6 +130,9 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
pd.set(0, num_output);
pd.set(1, bias_term);
pd.set(2, weight_data_size);
pd.set(8, weight_data_int8_scale);

pd.use_int8_inference = use_int8_inference;

op->load_param(pd);

@@ -160,6 +215,85 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
}
}

if (use_int8_inference)
{
Mat bottom_blob_bordered_int8;
bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
if (bottom_blob_bordered_int8.empty())
return -100;

float bottom_scale = opt.int8_scales[0];
// fprintf(stderr, "bottom_scale = %f\n", bottom_scale);

// quantize, scale and round to nearest
{
ncnn::ParamDict pd;
pd.set(0, bottom_scale);// scale

quantize->load_param(pd);

quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt);
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
{
int* outptr = top_blob.channel(p);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
int sum = 0;

const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

// channels
for (int q=0; q<channels; q++)
{
const Mat m = bottom_blob_bordered_int8.channel(q);
const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;

for (int k = 0; k < maxk; k++)
{
int val = sptr[ space_ofs[k] ];
int w = kptr[k];
sum += val * w;
}

kptr += maxk;
}

outptr[j] = sum;
}

outptr += outw;
}
}

// dequantize, reverse scale inplace
{
float top_rescale = 1.f / (bottom_scale * weight_data_int8_scale);

ncnn::ParamDict pd;
pd.set(0, top_rescale);// scale
pd.set(1, bias_term);// bias_term
pd.set(2, num_output);// bias_data_size

dequantize->load_param(pd);

ncnn::Mat weights[1];
weights[0] = bias_data;

dequantize->load_model(ModelBinFromMatArray(weights));

dequantize->forward_inplace(top_blob, opt);
}

return 0;
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p<num_output; p++)
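
Why dividing by bottom_scale * weight_data_int8_scale recovers the float result: the quantized values satisfy q_x ~ x * bottom_scale and q_w ~ w * weight_scale, so the int32 accumulator is approximately (sum of x * w) * bottom_scale * weight_scale, and multiplying by top_rescale = 1 / (bottom_scale * weight_scale) undoes both scalings up to rounding error; the Dequantize layer then adds the float32 bias. A toy check of that factor, purely illustrative and not ncnn code:

    #include <cmath>   // std::round

    // Illustrative only: one multiply-accumulate checked against its float value.
    float int8_mac_example()
    {
        float x = 0.8f, w = -0.05f;            // one activation and one weight
        float bottom_scale = 127.f / 1.0f;     // assumed calibration: activations in [-1, 1]
        float weight_scale = 127.f / 0.1f;     // assumed calibration: weights in [-0.1, 0.1]

        int qx = (int)std::round(x * bottom_scale);   // 102
        int qw = (int)std::round(w * weight_scale);   // -64
        int sum = qx * qw;                            // -6528, the int32 accumulator

        return sum * (1.f / (bottom_scale * weight_scale));  // ~ -0.0405 vs. exact x*w = -0.04
    }
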
7 changes: 7 additions & 0 deletions src/layer/convolution.h
@@ -23,6 +23,7 @@ class Convolution : public Layer
{
public:
Convolution();
~Convolution();

virtual int load_param(const ParamDict& pd);

@@ -44,10 +45,16 @@ class Convolution : public Layer
int bias_term;

int weight_data_size;
float weight_data_int8_scale;

// model
Mat weight_data;
Mat bias_data;

bool use_int8_inference;

ncnn::Layer* quantize;
ncnn::Layer* dequantize;
};

} // namespace ncnn
