Commit

Feature opencl kernel opt merge (#672)
* [OPENCL][INIT] init for winograd

* [OPENCL][WINOGRAD] add simple kernel for winograd

* [OPENCL][WINOGRAD] fix opencl compile error

* [OPENCL][WINOGRAD] fix opencl winograd enqueue error

* [OPENCL][WINOGRAD] fix opencl winograd some result error

* [OPENCL][WINOGRAD] fix opencl kernel error

* [ARM][WINOGRAD] fix arm winograd compile error

* [OPENCL][WINOGRAD] fix opencl logic error

* [OPENCL][WINOGRAD] fix winograd error

* [OPENCL][WINOGRAD] opencl winograd debug

* [OPENCL][WINOGRAD] support transform inner

* Fix typo. (#590)

Co-authored-by: E00216 <hcxiong@birentech.com>

* [BUG] fix layer resource count error when packing model (#592)

Co-authored-by: lucasktian <lucasktian@tencent.com>

* [OPENCL][WINOGRAD] filter for some case, need opt later

* Fix typo. (#594)

* [OPENCL][WINOGRAD] support innerproduct 4*4

* [OPENCL][WINOGRAD] opt for winograd some case

* [OPENCL][WINOGRAD] add transform outer

* [OPENCL][WINOGRAD] opt for winograd

* [CORE][TUNE] support tune kernel and tune opencl

* [OPENCL][WINOGRAD] refactor for winograd

* [OPENCL][WINOGRAD] fix compile error

* [OPENCL][WINOGRAD] add double check

* [OPENCL][WINOGRAD] support more cases

* [BENCHMARK][MODEL] add densenet and inception v4 model

* [OPENCL][CACHE] store opencl tune to cache

* [OPENCL][CACHE] add mutex for multithread

* [CACHE][OPT] add cache prefix

* [CACHE][REFACTOR] generate cache file name

* [ANDROID] TNN Benchmark Tools support APP

* [ANDROID] add CMakeLists in benchmark app

* [ANDROID] update gradle version for linux support

* [ANDROID] benchmark app docs add

* [ANDROID] modify doc

* [ANDROID] remove redundant lib in benchmark app

* [benchmark][model] rename shufflenet_v2

* [ANDROID] modify benchmark model script && update doc

* [OPENCL][TUNE] support tune softmax and pooling layer

* [TUNE][OPT] opt local tune sleep

* [BENCHMARK][TEST] prevent overheating and frequency reduction

* [ANDROID] remove redundant debug log

* [BENCHMARK][APP] fix abnormal benchmark output

* [DOC][BENCHMARK] update benchmark doc

* remove redundant log output

* [BENCHMARK][UNINSTALL] uninstall benchmark app

* [ANDROID] update android sdk download link

* fix xcode build error

* [DEVICE][ARM] fix include error

* [BENCHMARK][APP] support load so from external

* [BENCHMARK][FIX] fix benchmark app error

* [BENCHMARK][APP] support passing parameters

* [BENCHMARK][APP] support huawei_npu

* [BENCHMARK][APP] pass the model in via parameters

* [BENCHMARK][APP] remove unused

* [BENCHMARK][APP] remove log and set target sdk version

* [BENCHMARK][APP] remove unused

* [BENCHMARK][APP] fix benchmark background error

* update benchmark model script

* [OPENCL] optimize conv perf with local mem && depthwise mix

* [OPENCL] fix conv activation fuse && fix conv wb1 kernel

* [BENCHMARK][APP] fix export signed app error

* [BENCHMARK][APP] catch launch benchmark app from unexpected intent exception

* [BENCHMARK][APP] support benchmark in app or shell

* [BENCHMARK][APP] remove unused

* [OPENCL] fix conv opt for opencl

* [BENCHMARK][APP] profiling only in shell for print

* [BENCHMARK][APP] update doc and change benchmark shell logic

* [BENCHMARK][APP] fix logcat latency

* [BENCHMARK][APP] fix benchmark app some case

* fix benchmark result overlap

* fix merge bug

* [OPENCL] remove buffer part in depthwise conv

* [OPENCL] remove logic with buffer in depthwise conv

Co-authored-by: neiltian <neiltian@tencent.com>
Co-authored-by: Xionghc <xionghcx@gmail.com>
Co-authored-by: E00216 <hcxiong@birentech.com>
Co-authored-by: ShaunDai <66760945+shaundai-tencent@users.noreply.github.com>
Co-authored-by: lucasktian <lucasktian@tencent.com>
Co-authored-by: neiltian <65950677+neiltian-tencent@users.noreply.github.com>
7 people committed Dec 29, 2020
1 parent 0003ac9 commit aedc6c8
Showing 9 changed files with 270 additions and 35 deletions.
@@ -17,6 +17,9 @@

namespace TNN_NS {

#define LowOpHWThre 9
#define HighOpIntensityThre 128

// magic number
static const uint32_t lws_limit = 128;

@@ -53,9 +56,14 @@ Status OpenCLConvLayer1x1Acc::Init(Context *context, LayerParam *param, LayerRes
ret = AllocateWeightsBias(resource);
CHECK_TNN_OK(ret)

auto input_dims = inputs[0]->GetBlobDesc().dims;
auto output_dims = outputs[0]->GetBlobDesc().dims;

const int output_channel = output_dims[1];
const int input_channel = input_dims[1];
const int output_batch = output_dims[0];
const int output_channel = output_dims[1];
const int output_height = output_dims[2];
const int output_width = output_dims[3];

std::string kernel_name;
if (run_3d_ndrange_) {
@@ -70,7 +78,18 @@ Status OpenCLConvLayer1x1Acc::Init(Context *context, LayerParam *param, LayerRes
kernel_name += "_MIX";
}

if (output_channel > 4 && run_3d_ndrange_ && !use_buffer_) {
uint32_t compute_units = OpenCLRuntime::GetInstance()->DeviceComputeUnits();
int task_size = output_batch * UP_DIV(output_channel, 4) * output_height * output_width;
int task_size_per_cu = task_size / static_cast<int>(compute_units);
if (!run_3d_ndrange_ && stride_is_1_ && use_buffer_ && task_size_per_cu < 256) {
width_blocking_is_1_ = true;
kernel_name += "_WB1";
run_local_work_ = (UP_DIV(input_channel, 4) >= HighOpIntensityThre) &&
(output_height * output_width <= LowOpHWThre);
if (run_local_work_) {
kernel_name += "_Local";
}
} else if (output_channel > 4 && run_3d_ndrange_ && !use_buffer_) {
is_channel_blocking_ = true;
kernel_name += "_CB2";
}
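
The branch added above routes a 1x1 convolution to the new width-blocking-1 ("_WB1") kernel when the per-compute-unit workload is small, and further enables a local-memory reduction ("_Local") when the input channel depth is high but the spatial output is tiny. A minimal standalone sketch of that heuristic, with hypothetical shapes and compute-unit count (UpDiv is an illustrative stand-in for TNN's UP_DIV, and the !run_3d_ndrange_ / stride_is_1_ / use_buffer_ preconditions are assumed to hold):

```cpp
#include <cstdio>

// Ceiling division; stand-in for TNN's UP_DIV macro (assumed semantics).
static int UpDiv(int x, int y) { return (x + y - 1) / y; }

int main() {
    // Hypothetical 1x1 conv: N=1, Cin=512, Cout=256, H=W=3, on a GPU with 16 compute units.
    int batch = 1, in_channel = 512, out_channel = 256, height = 3, width = 3;
    int compute_units = 16;

    // Thresholds taken from the hunk above: 256 tasks per CU, HighOpIntensityThre=128, LowOpHWThre=9.
    int task_size        = batch * UpDiv(out_channel, 4) * height * width;  // 1 * 64 * 3 * 3 = 576
    int task_size_per_cu = task_size / compute_units;                       // 36

    bool width_blocking_is_1 = task_size_per_cu < 256;                      // true  -> "_WB1"
    bool run_local_work      = width_blocking_is_1 &&
                               UpDiv(in_channel, 4) >= 128 &&               // 128 >= 128
                               height * width <= 9;                         // 9 <= 9
    printf("WB1=%d Local=%d\n", width_blocking_is_1, run_local_work);       // WB1=1 Local=1
    return 0;
}
```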
@@ -99,6 +118,13 @@ Status OpenCLConvLayer1x1Acc::Reshape(const std::vector<Blob *> &inputs, const s
const int output_channels = output_dims[1];
const int output_channel_blocks = UP_DIV(output_channels, 4);

int type_size = sizeof(float);
if (OpenCLRuntime::GetInstance()->GetPrecision() != PRECISION_HIGH) {
type_size = 2;
}
auto &unit = execute_units_[0];
uint32_t workgroup_size = 0;

if (run_3d_ndrange_) {
if (is_channel_blocking_) {
execute_units_[0].global_work_size = {static_cast<uint32_t>(UP_DIV(output_dims[1], 8)),
@@ -114,7 +140,23 @@ Status OpenCLConvLayer1x1Acc::Reshape(const std::vector<Blob *> &inputs, const s
Conv2d1x1LocalWS3D(execute_units_[0].global_work_size, execute_units_[0].workgroupsize_max);

} else {
if (is_channel_blocking_) {
if (width_blocking_is_1_) {
if (run_local_work_) {
auto &unit = execute_units_[0];
workgroup_size = std::min(static_cast<uint32_t>(unit.local_mem_size / (4 * type_size)),
unit.workgroupsize_max);
workgroup_size = std::min(static_cast<uint32_t>(input_channel_blocks), workgroup_size);
int temp_size = 1;
while ((temp_size <<= 1) <= workgroup_size);
workgroup_size = temp_size >> 1;

execute_units_[0].global_work_size = {static_cast<uint32_t>(UP_DIV(output_dims[1], 4) * output_dims[3] * workgroup_size),
static_cast<uint32_t>(output_dims[0] * output_dims[2])};
} else {
execute_units_[0].global_work_size = {static_cast<uint32_t>(UP_DIV(output_dims[1], 4) * output_dims[3]),
static_cast<uint32_t>(output_dims[0] * output_dims[2])};
}
} else if (is_channel_blocking_) {
execute_units_[0].global_work_size = {
static_cast<uint32_t>(UP_DIV(output_dims[1], 8) * UP_DIV(output_dims[3], 4)),
static_cast<uint32_t>(output_dims[0] * output_dims[2])};
@@ -124,8 +166,12 @@ Status OpenCLConvLayer1x1Acc::Reshape(const std::vector<Blob *> &inputs, const s
static_cast<uint32_t>(output_dims[0] * output_dims[2])};
}

execute_units_[0].local_work_size = Conv2dCommonLocalWS2D(
execute_units_[0].global_work_size, execute_units_[0].workgroupsize_max, execute_units_[0].sub_group_size);
if (!run_local_work_) {
execute_units_[0].local_work_size = Conv2dCommonLocalWS2D(
execute_units_[0].global_work_size, execute_units_[0].workgroupsize_max, execute_units_[0].sub_group_size);
} else {
execute_units_[0].local_work_size = {workgroup_size, 1};
}
}
//input width, input height
int input_imageshape[2] = {input_dims[3], input_dims[2]};
@@ -154,10 +200,17 @@ Status OpenCLConvLayer1x1Acc::Reshape(const std::vector<Blob *> &inputs, const s
execute_units_[0].ocl_kernel.setArg(idx++, sizeof(output_imageshape), output_imageshape);
execute_units_[0].ocl_kernel.setArg(idx++, sizeof(stride_shape), stride_shape);
}
// set value (output widht / 4)
execute_units_[0].ocl_kernel.setArg(idx++, UP_DIV(output_dims[3], 4));
if (!width_blocking_is_1_) {
// set value (output width / 4)
execute_units_[0].ocl_kernel.setArg(idx++, UP_DIV(output_dims[3], 4));
}

if (run_local_work_) {
execute_units_[0].ocl_kernel.setArg(idx++, UP_DIV(input_channel_blocks, workgroup_size));
execute_units_[0].ocl_kernel.setArg(idx++, workgroup_size * 4 * type_size, nullptr);
}

if (ocl_context_->GetEnableTuneKernel()) {
if (!run_local_work_ && ocl_context_->GetEnableTuneKernel()) {
execute_units_[0].local_work_size = LocalTune(execute_units_[0], ocl_context_, GenerateTuneKernelKey(execute_units_[0]));
}

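In the run_local_work_ branch of Reshape above, the work-group size is first bounded by the device's local memory and maximum work-group size, then clamped to the number of input channel blocks, and finally snapped down to a power of two by the shift loop. A minimal sketch of that sizing, assuming the same loop structure and hypothetical device limits (FloorPow2 is an illustrative helper name):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Largest power of two not exceeding workgroup_size, mirroring the
// `while ((temp_size <<= 1) <= workgroup_size);` loop in the diff.
static uint32_t FloorPow2(uint32_t workgroup_size) {
    uint32_t temp_size = 1;
    while ((temp_size <<= 1) <= workgroup_size) {
    }
    return temp_size >> 1;
}

int main() {
    // Hypothetical device: 16 KB local memory, max work-group size 256, fp16 path (type_size = 2).
    uint32_t local_mem_size = 16 * 1024, workgroupsize_max = 256, type_size = 2;
    uint32_t input_channel_blocks = 96;

    uint32_t wgs = std::min(local_mem_size / (4 * type_size), workgroupsize_max);  // min(2048, 256) = 256
    wgs = std::min(input_channel_blocks, wgs);                                     // 96
    wgs = FloorPow2(wgs);                                                          // 64
    printf("workgroup_size = %u\n", wgs);
    return 0;
}
```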
@@ -35,6 +35,8 @@ class OpenCLConvLayer1x1Acc : public OpenCLConvLayerAccImpl {
std::vector<uint32_t> Conv2d1x1LocalWS3D(std::vector<uint32_t> &gws, const uint32_t max_workgroup_size);

bool stride_is_1_ = false;
bool width_blocking_is_1_ = false;
bool run_local_work_ = false;
};

} // namespace TNN_NS
@@ -163,8 +163,14 @@ Status OpenCLConvLayerAccImpl::ConvertWeights(float *weights_data_ptr) {
if (use_buffer_) {
// create weights use clBuffer
DimsVector filter_buffershape;
filter_buffershape = {ROUND_UP(conv_params_.output_channel, 4), ROUND_UP(conv_params_.input_channel, 4),
conv_params_.kernel_y, conv_params_.kernel_x};
if (CT_CONV_DEPTHWISE == conv_type_) {
filter_buffershape = {1, ROUND_UP(conv_params_.output_channel, 4),
conv_params_.kernel_y, conv_params_.kernel_x};
} else {
filter_buffershape = {ROUND_UP(conv_params_.output_channel, 4), ROUND_UP(conv_params_.input_channel, 4),
conv_params_.kernel_y, conv_params_.kernel_x};
}

ocl_weights_.reset(new OpenCLMemory(TNN_CL_BUFFER));
size_t type_size = sizeof(float);
if (opencl_runtime->GetPrecision() != PRECISION_HIGH)
@@ -182,7 +188,11 @@ Status OpenCLConvLayerAccImpl::ConvertWeights(float *weights_data_ptr) {

// transfer from clBuffer to clBuffer
ImageBufferConvertor convertor(opencl_runtime, ocl_context_->CommandQueue());
return convertor.ConvertBufferToBuffer(weight_memory.get(), CONV2D_FILTER, filter_shape, ocl_weights_.get(),
OpenCLBufferFormat buffer_format = CONV2D_FILTER;
if (CT_CONV_DEPTHWISE == conv_type_) {
buffer_format = DW_CONV2D_FILTER;
}
return convertor.ConvertBufferToBuffer(weight_memory.get(), buffer_format, filter_shape, ocl_weights_.get(),
true);
} else {
// create weights use clImage
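The ConvertWeights changes above give depthwise filters their own buffer shape and buffer format (DW_CONV2D_FILTER): the input-channel dimension collapses to 1 and only the channel count is rounded up to a multiple of 4. A worked sketch with hypothetical layer parameters (RoundUp is an illustrative stand-in for TNN's ROUND_UP):

```cpp
#include <cstdio>
#include <vector>

// Round x up to the next multiple of y; stand-in for TNN's ROUND_UP macro (assumed semantics).
static int RoundUp(int x, int y) { return ((x + y - 1) / y) * y; }

int main() {
    // Hypothetical 3x3 conv with 30 input and 30 output channels.
    int output_channel = 30, input_channel = 30, kernel_y = 3, kernel_x = 3;

    // Depthwise branch: {1, ROUND_UP(Cout, 4), Ky, Kx}
    std::vector<int> dw_shape = {1, RoundUp(output_channel, 4), kernel_y, kernel_x};        // {1, 32, 3, 3}
    // Dense branch:     {ROUND_UP(Cout, 4), ROUND_UP(Cin, 4), Ky, Kx}
    std::vector<int> dense_shape = {RoundUp(output_channel, 4), RoundUp(input_channel, 4),
                                    kernel_y, kernel_x};                                     // {32, 32, 3, 3}

    printf("dw    = {%d, %d, %d, %d}\n", dw_shape[0], dw_shape[1], dw_shape[2], dw_shape[3]);
    printf("dense = {%d, %d, %d, %d}\n", dense_shape[0], dense_shape[1], dense_shape[2], dense_shape[3]);
    return 0;
}
```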
@@ -63,7 +63,7 @@ Status OpenCLConvLayerCommonAcc::Init(Context *context, LayerParam *param, Layer
if (use_buffer_) {
kernel_name += "_MIX";
}
int task_size = output_batch * output_channel * output_height * output_width;
int task_size = output_batch * UP_DIV(output_channel, 4) * output_height * output_width;
if (task_size > 4096 && output_channel > 4) {
is_channel_blocking_ = true;
kernel_name += "_CB2";
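The one-line fix above counts work items in channel blocks of four (the kernel's output vector width) rather than raw channels, so the "_CB2" channel-blocking variant is no longer chosen for layers whose blocked workload is actually small. A quick numeric sketch, assuming UP_DIV is ceiling division:

```cpp
#include <cstdio>

// Ceiling division; stand-in for TNN's UP_DIV macro (assumed semantics).
static int UpDiv(int x, int y) { return (x + y - 1) / y; }

int main() {
    // Hypothetical layer: N=1, Cout=256, H=W=5; the threshold in the diff is task_size > 4096.
    int batch = 1, out_channel = 256, height = 5, width = 5;

    int old_task_size = batch * out_channel * height * width;            // 6400 -> picked "_CB2"
    int new_task_size = batch * UpDiv(out_channel, 4) * height * width;  // 1600 -> stays on the plain kernel
    printf("old=%d new=%d\n", old_task_size, new_task_size);
    return 0;
}
```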
6 changes: 6 additions & 0 deletions source/tnn/device/opencl/cl/base.inc
@@ -30,6 +30,12 @@ __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP |
out##i = mad(in##i.z, weights2, out##i); \
out##i = mad(in##i.w, weights3, out##i);

#define CALCULATE_VEC16_OUTPUT(i) \
out##i = mad(in##i.x, weights.s0123, out##i); \
out##i = mad(in##i.y, weights.s4567, out##i); \
out##i = mad(in##i.z, weights.s89ab, out##i); \
out##i = mad(in##i.w, weights.scdef, out##i);

#define CALCULATE_SLICE_OUTPUT(s_idx) \
out_w0_s##s_idx += weights_c0_s##s_idx * in0.x; \
out_w1_s##s_idx += weights_c0_s##s_idx * in1.x; \
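The new CALCULATE_VEC16_OUTPUT macro accumulates a 4-wide input vector against a 16-wide weight register, i.e. a 4x4 matrix-vector multiply-accumulate where weights.s0123, .s4567, .s89ab and .scdef act as the four weight rows. A scalar C++ sketch of the same arithmetic (illustrative helper name; the OpenCL version performs each mad across a whole FLOAT4 lane at once):

```cpp
#include <array>
#include <cstdio>

// out += W * in, with W a 4x4 block stored as 16 scalars
// (weights[0..3] ~ weights.s0123, weights[4..7] ~ weights.s4567, ...).
static void CalculateVec16Output(const std::array<float, 16>& weights,
                                 const std::array<float, 4>& in,
                                 std::array<float, 4>& out) {
    for (int c = 0; c < 4; ++c) {                           // c indexes in.x/.y/.z/.w
        for (int lane = 0; lane < 4; ++lane) {
            out[lane] += in[c] * weights[4 * c + lane];     // mad(in.c, weights.row(c), out)
        }
    }
}

int main() {
    std::array<float, 16> w{};
    w.fill(1.0f);
    std::array<float, 4> in  = {1.0f, 2.0f, 3.0f, 4.0f};
    std::array<float, 4> out = {0.0f, 0.0f, 0.0f, 0.0f};
    CalculateVec16Output(w, in, out);
    printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);  // 10.0 10.0 10.0 10.0
    return 0;
}
```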
54 changes: 54 additions & 0 deletions source/tnn/device/opencl/cl/buffer_to_buffer.cl
@@ -28,6 +28,60 @@ __kernel void Conv2DFilterBufferToBuffer(GLOBAL_SIZE_2_DIMS __global const float
output_ptr[output_offset] = (FLOAT)(val);
}

// convert kernel : from buffer(mihw) to buffer(ic/4 h w m ic4)
// but dw currently only supports m == 1
__kernel void DWFilterBufferToBuffer(GLOBAL_SIZE_2_DIMS __global const float *input_ptr,
__private const int4 kernel_shape, __private const int height_width_size,
__global FLOAT* output_ptr) {
const int hw_idx = get_global_id(0);
const int ic_block_idx = get_global_id(1);

DEAL_NON_UNIFORM_DIM2(hw_idx, ic_block_idx);

FLOAT4 output_values = 0;
if (kernel_shape.x == 1) {
const int input_channel_4_idx = ic_block_idx << 2;
const int buffer_height_idx = hw_idx / kernel_shape.w;
const int buffer_width_idx = hw_idx % kernel_shape.w;

const int buffer_offset =
mad24(mad24(input_channel_4_idx, kernel_shape.z, buffer_height_idx), kernel_shape.w, buffer_width_idx);

const int remain_channel = kernel_shape.y - input_channel_4_idx;
if (input_channel_4_idx < kernel_shape.y) {
if (remain_channel >= 4) {
int offset = buffer_offset;
output_values.x = (FLOAT)(*(input_ptr + offset));
offset += height_width_size;
output_values.y = (FLOAT)(*(input_ptr + offset));
offset += height_width_size;
output_values.z = (FLOAT)(*(input_ptr + offset));
offset += height_width_size;
output_values.w = (FLOAT)(*(input_ptr + offset));
} else if (remain_channel == 3) {
int offset = buffer_offset;
output_values.x = (FLOAT)(*(input_ptr + offset));
offset += height_width_size;
output_values.y = (FLOAT)(*(input_ptr + offset));
offset += height_width_size;
output_values.z = (FLOAT)(*(input_ptr + offset));

} else if (remain_channel == 2) {
int offset = buffer_offset;
output_values.x = (FLOAT)(*(input_ptr + offset));
offset += height_width_size;
output_values.y = (FLOAT)(*(input_ptr + offset));
} else if (remain_channel == 1) {
int offset = buffer_offset;
output_values.x = (FLOAT)(*(input_ptr + offset));
}
}
const int output_offset = mad24(mad24(ic_block_idx, kernel_shape.z, buffer_height_idx),
kernel_shape.w, buffer_width_idx) << 2;
vstore4(output_values, 0, output_ptr + output_offset);
}
}

// convert arg as 4 alignment
__kernel void ArgBufferToBuffer(GLOBAL_SIZE_2_DIMS __global const float *input_ptr, __private const int count,
__global FLOAT* output_ptr) {
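The DWFilterBufferToBuffer kernel added above repacks a depthwise filter from (m=1, C, H, W) order into channel blocks of four with a zero-padded tail, writing one FLOAT4 per (block, h, w) position. A host-side C++ sketch of the equivalent repacking (illustrative helper name; assumes multiplier m == 1, as the kernel comment notes):

```cpp
#include <cstdio>
#include <vector>

// Repack (m=1, C, H, W) -> (C/4, H, W, c4), zero-padding the channel tail;
// host-side analogue of DWFilterBufferToBuffer.
static std::vector<float> RepackDwFilter(const std::vector<float>& src,
                                         int channels, int height, int width) {
    int channel_blocks = (channels + 3) / 4;
    std::vector<float> dst(static_cast<size_t>(channel_blocks) * height * width * 4, 0.0f);
    for (int cb = 0; cb < channel_blocks; ++cb) {
        for (int h = 0; h < height; ++h) {
            for (int w = 0; w < width; ++w) {
                for (int c4 = 0; c4 < 4; ++c4) {
                    int c = cb * 4 + c4;
                    if (c >= channels) continue;                       // padded tail stays zero
                    int src_offset = (c * height + h) * width + w;     // (c, h, w) in the source buffer
                    int dst_offset = (((cb * height + h) * width + w) << 2) + c4;
                    dst[dst_offset] = src[src_offset];
                }
            }
        }
    }
    return dst;
}

int main() {
    // Hypothetical 3x3 depthwise filter with 5 channels (2 blocks after padding).
    std::vector<float> src(5 * 3 * 3, 1.0f);
    std::vector<float> dst = RepackDwFilter(src, 5, 3, 3);
    printf("dst size = %zu\n", dst.size());  // 2 * 3 * 3 * 4 = 72
    return 0;
}
```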
