Skip to content

Commit

Permalink
[DEVICE][OPENCL] 优化3*3卷积和非对称卷积 (#515)
Browse files Browse the repository at this point in the history
  • Loading branch information
neiltian-tencent committed Nov 3, 2020
1 parent c8100db commit 72c118d
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,36 @@ std::vector<uint32_t> OpenCLConvLayerAccImpl::Conv2dCommonLocalWS2D(std::vector<
}

// Calculate the 3D local work size; specialized for the default (common) conv kernel.
std::vector<uint32_t> OpenCLConvLayerAccImpl::Conv2dCommonLocalWS3D(std::vector<uint32_t> &gws,
// Computes a 3-element local work size (lws) for the 3D NDRange of the common
// conv kernel; the caller selects this variant when the filter is 3x3.
//
// gws                : global work size; gws[0], gws[1] and gws[2] are read.
// kernel_size        : not read by this function. NOTE(review): the heuristic
//                      divides by kernel_cache_size, which is defined outside
//                      this view -- confirm that is the intended divisor.
// max_workgroup_size : upper bound on lws[0] * lws[1] * lws[2]. When it is 0
//                      the heuristic is skipped and {1, 1, 1} is returned.
//
// Returns a vector of three entries, each guaranteed to be >= 1.
std::vector<uint32_t> OpenCLConvLayerAccImpl::Conv2dCommonLocalWS3DKernel3x3(std::vector<uint32_t> &gws,
                                                                             const uint32_t kernel_size,
                                                                             const uint32_t max_workgroup_size) {
    // Only half of the reported compute units are used here, but never fewer than one.
    uint32_t compute_units = std::max<uint32_t>(OpenCLRuntime::GetInstance()->DeviceComputeUnits() / 2, 1);
    uint64_t cache_size    = OpenCLRuntime::GetInstance()->DeviceGlobalMemeryCacheSize();
    // Scale factor derived from the global memory cache size, clamped to [1, 4].
    const uint32_t base = std::max<uint32_t>(std::min<uint32_t>(cache_size / g_base_gpu_mem_cachesize, 4), 1);
    std::vector<uint32_t> lws(3, 1);
    if (max_workgroup_size > 0) {
        // Clamp to >= 1: a zero entry in gws would otherwise make lws[1] (and
        // later lws_size) zero, and both are used as divisors below.
        lws[1] = std::max<uint32_t>(std::min<uint32_t>(gws[1], max_workgroup_size), 1);
        lws[0] = std::max<uint32_t>(
            std::min<uint32_t>(std::min<uint32_t>(gws[0], base), max_workgroup_size / lws[1]), 1);
        const uint32_t lws_size = lws[0] * lws[1];

        lws[2] = std::min<uint32_t>(ROUND_UP(cache_size / kernel_cache_size / lws_size / compute_units, base), gws[2]);
        if (lws[2] == 0) {
            // The cache-based estimate rounded down to nothing; fall back to base.
            lws[2] = std::min<uint32_t>(gws[2], base);
        }
        // Keep the total work-group size within the limit, and lws[2] at least 1.
        lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], max_workgroup_size / lws_size), 1);
    }

    LOGD("compute_units : %d , max_workgroup_size : %d\n", compute_units, max_workgroup_size);
    LOGD("layer: %s conv_common [%d, %d, %d] -- [%d, %d, %d] \n", layer_name_.c_str(), gws[0], gws[1], gws[2], lws[0],
         lws[1], lws[2]);
    return lws;
}

// Calculate the 3D local work size; specialized for the default (common) conv kernel.
std::vector<uint32_t> OpenCLConvLayerAccImpl::Conv2dCommonLocalWS3DGeneral(std::vector<uint32_t> &gws,
const uint32_t kernel_size,
const uint32_t max_workgroup_size) {

uint32_t compute_units = OpenCLRuntime::GetInstance()->DeviceComputeUnits();
uint64_t cache_size = OpenCLRuntime::GetInstance()->DeviceGlobalMemeryCacheSize();
const uint32_t base = std::max<uint32_t>(cache_size / g_base_gpu_mem_cachesize, 1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ class OpenCLConvLayerAccImpl : public OpenCLLayerAcc {
Status AllocateWeightsBias(LayerResource *resource);
std::vector<uint32_t> Conv2dCommonLocalWS2D(std::vector<uint32_t> &gws, const uint32_t max_workgroup_size,
const uint32_t subgroup_size = 0);
std::vector<uint32_t> Conv2dCommonLocalWS3D(std::vector<uint32_t> &gws, const uint32_t kernel_size,
std::vector<uint32_t> Conv2dCommonLocalWS3DGeneral(std::vector<uint32_t> &gws, const uint32_t kernel_size,
const uint32_t max_workgroup_size);

std::vector<uint32_t> Conv2dCommonLocalWS3DKernel3x3(std::vector<uint32_t> &gws, const uint32_t kernel_size,
const uint32_t max_workgroup_size);

private:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "tnn/device/opencl/acc/convolution/opencl_conv_layer_common_acc.h"
#include "tnn/device/opencl/imagebuffer_convertor.h"
#include "tnn/utils/string_utils_inner.h"

namespace TNN_NS {

Expand All @@ -30,12 +31,16 @@ Status OpenCLConvLayerCommonAcc::Init(Context *context, LayerParam *param, Layer
const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs) {
LOGD("Init Conv Common Acc\n");

conv_type_ = CT_CONV_COMMON;
op_name_ = "Conv";

Status ret = OpenCLConvLayerAccImpl::Init(context, param, resource, inputs, outputs);
CHECK_TNN_OK(ret)

conv_type_ = CT_CONV_COMMON;
op_name_ = "Conv_" + ToString(conv_params_.kernel_x) + "x" + ToString(conv_params_.kernel_y);

if(conv_params_.kernel_x != conv_params_.kernel_y) {
run_3d_ndrange_ = false;
}

ret = AllocateWeightsBias(resource);
CHECK_TNN_OK(ret)

Expand Down Expand Up @@ -82,8 +87,13 @@ Status OpenCLConvLayerCommonAcc::Reshape(const std::vector<Blob *> &inputs, cons
execute_units_[0].global_work_size = {static_cast<uint32_t>(UP_DIV(output_dims[1], 4)),
static_cast<uint32_t>(UP_DIV(output_dims[3], 4)),
static_cast<uint32_t>(output_dims[0] * output_dims[2])};
execute_units_[0].local_work_size = Conv2dCommonLocalWS3D(
execute_units_[0].global_work_size, kernel_shape[0] * kernel_shape[1], execute_units_[0].workgroupsize_max);
if(kernel_shape[0] == 3 && kernel_shape[1] == 3) {
execute_units_[0].local_work_size = Conv2dCommonLocalWS3DKernel3x3(
execute_units_[0].global_work_size, kernel_shape[0] * kernel_shape[1], execute_units_[0].workgroupsize_max);
} else {
execute_units_[0].local_work_size = Conv2dCommonLocalWS3DGeneral(
execute_units_[0].global_work_size, kernel_shape[0] * kernel_shape[1], execute_units_[0].workgroupsize_max);
}
} else {
execute_units_[0].global_work_size = {
static_cast<uint32_t>(UP_DIV(output_dims[1], 4) * UP_DIV(output_dims[3], 4)),
Expand Down

0 comments on commit 72c118d

Please sign in to comment.