diff --git a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.cc b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.cc index 853a2e50e..d7a5c3d49 100644 --- a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.cc +++ b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.cc @@ -238,9 +238,36 @@ std::vector OpenCLConvLayerAccImpl::Conv2dCommonLocalWS2D(std::vector< } // local size 3d calculate, special for conv default. -std::vector OpenCLConvLayerAccImpl::Conv2dCommonLocalWS3D(std::vector &gws, +std::vector OpenCLConvLayerAccImpl::Conv2dCommonLocalWS3DKernel3x3(std::vector &gws, const uint32_t kernel_size, const uint32_t max_workgroup_size) { + uint32_t compute_units = std::max(OpenCLRuntime::GetInstance()->DeviceComputeUnits() / 2, 1); + uint64_t cache_size = OpenCLRuntime::GetInstance()->DeviceGlobalMemeryCacheSize(); + const uint32_t base = std::max(std::min(cache_size / g_base_gpu_mem_cachesize, 4), 1); + std::vector lws(3, 1); + if (max_workgroup_size > 0) { + lws[1] = std::min(gws[1], max_workgroup_size); + lws[0] = std::min(std::min(gws[0], base), max_workgroup_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + + lws[2] = std::min(ROUND_UP(cache_size / kernel_cache_size / lws_size / compute_units, base), gws[2]); + if (lws[2] == 0) { + lws[2] = std::min(gws[2], base); + } + lws[2] = std::max(std::min(lws[2], max_workgroup_size / lws_size), 1); + } + + LOGD("compute_units : %d , max_workgroup_size : %d\n", compute_units, max_workgroup_size); + LOGD("layer: %s conv_common [%d, %d, %d] -- [%d, %d, %d] \n", layer_name_.c_str(), gws[0], gws[1], gws[2], lws[0], + lws[1], lws[2]); + return lws; +} + +// local size 3d calculate, special for conv default. +std::vector OpenCLConvLayerAccImpl::Conv2dCommonLocalWS3DGeneral(std::vector &gws, + const uint32_t kernel_size, + const uint32_t max_workgroup_size) { + uint32_t compute_units = OpenCLRuntime::GetInstance()->DeviceComputeUnits(); uint64_t cache_size = OpenCLRuntime::GetInstance()->DeviceGlobalMemeryCacheSize(); const uint32_t base = std::max(cache_size / g_base_gpu_mem_cachesize, 1); diff --git a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.h b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.h index a5d12cdf2..fa77070cf 100644 --- a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.h +++ b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.h @@ -55,7 +55,10 @@ class OpenCLConvLayerAccImpl : public OpenCLLayerAcc { Status AllocateWeightsBias(LayerResource *resource); std::vector Conv2dCommonLocalWS2D(std::vector &gws, const uint32_t max_workgroup_size, const uint32_t subgroup_size = 0); - std::vector Conv2dCommonLocalWS3D(std::vector &gws, const uint32_t kernel_size, + std::vector Conv2dCommonLocalWS3DGeneral(std::vector &gws, const uint32_t kernel_size, + const uint32_t max_workgroup_size); + + std::vector Conv2dCommonLocalWS3DKernel3x3(std::vector &gws, const uint32_t kernel_size, const uint32_t max_workgroup_size); private: diff --git a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_common_acc.cc b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_common_acc.cc index 61db9d9f6..805dcca85 100644 --- a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_common_acc.cc +++ b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_common_acc.cc @@ -14,6 +14,7 @@ #include "tnn/device/opencl/acc/convolution/opencl_conv_layer_common_acc.h" #include "tnn/device/opencl/imagebuffer_convertor.h" +#include "tnn/utils/string_utils_inner.h" namespace TNN_NS { @@ -30,12 +31,16 @@ Status OpenCLConvLayerCommonAcc::Init(Context *context, LayerParam *param, Layer const std::vector &inputs, const std::vector &outputs) { LOGD("Init Conv Common Acc\n"); - conv_type_ = CT_CONV_COMMON; - op_name_ = "Conv"; - Status ret = OpenCLConvLayerAccImpl::Init(context, param, resource, inputs, outputs); CHECK_TNN_OK(ret) + conv_type_ = CT_CONV_COMMON; + op_name_ = "Conv_" + ToString(conv_params_.kernel_x) + "x" + ToString(conv_params_.kernel_y); + + if(conv_params_.kernel_x != conv_params_.kernel_y) { + run_3d_ndrange_ = false; + } + ret = AllocateWeightsBias(resource); CHECK_TNN_OK(ret) @@ -82,8 +87,13 @@ Status OpenCLConvLayerCommonAcc::Reshape(const std::vector &inputs, cons execute_units_[0].global_work_size = {static_cast(UP_DIV(output_dims[1], 4)), static_cast(UP_DIV(output_dims[3], 4)), static_cast(output_dims[0] * output_dims[2])}; - execute_units_[0].local_work_size = Conv2dCommonLocalWS3D( - execute_units_[0].global_work_size, kernel_shape[0] * kernel_shape[1], execute_units_[0].workgroupsize_max); + if(kernel_shape[0] == 3 && kernel_shape[1] == 3) { + execute_units_[0].local_work_size = Conv2dCommonLocalWS3DKernel3x3( + execute_units_[0].global_work_size, kernel_shape[0] * kernel_shape[1], execute_units_[0].workgroupsize_max); + } else { + execute_units_[0].local_work_size = Conv2dCommonLocalWS3DGeneral( + execute_units_[0].global_work_size, kernel_shape[0] * kernel_shape[1], execute_units_[0].workgroupsize_max); + } } else { execute_units_[0].global_work_size = { static_cast(UP_DIV(output_dims[1], 4) * UP_DIV(output_dims[3], 4)),