diff --git a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_1x1_acc.cc b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_1x1_acc.cc index 87392a70e..5fd3d33a5 100644 --- a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_1x1_acc.cc +++ b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_1x1_acc.cc @@ -17,6 +17,9 @@ namespace TNN_NS { +#define LowOpHWThre 9 +#define HighOpIntensityThre 128 + // magic number static const uint32_t lws_limit = 128; @@ -53,9 +56,14 @@ Status OpenCLConvLayer1x1Acc::Init(Context *context, LayerParam *param, LayerRes ret = AllocateWeightsBias(resource); CHECK_TNN_OK(ret) + auto input_dims = inputs[0]->GetBlobDesc().dims; auto output_dims = outputs[0]->GetBlobDesc().dims; - const int output_channel = output_dims[1]; + const int input_channel = input_dims[1]; + const int output_batch = output_dims[0]; + const int output_channel = output_dims[1]; + const int output_height = output_dims[2]; + const int output_width = output_dims[3]; std::string kernel_name; if (run_3d_ndrange_) { @@ -70,7 +78,18 @@ Status OpenCLConvLayer1x1Acc::Init(Context *context, LayerParam *param, LayerRes kernel_name += "_MIX"; } - if (output_channel > 4 && run_3d_ndrange_ && !use_buffer_) { + uint32_t compute_units = OpenCLRuntime::GetInstance()->DeviceComputeUnits(); + int task_size = output_batch * UP_DIV(output_channel, 4) * output_height * output_width; + int task_size_per_cu = task_size / static_cast(compute_units); + if (!run_3d_ndrange_ && stride_is_1_ && use_buffer_ && task_size_per_cu < 256) { + width_blocking_is_1_ = true; + kernel_name += "_WB1"; + run_local_work_ = (UP_DIV(input_channel, 4) >= HighOpIntensityThre) && + (output_height * output_width <= LowOpHWThre); + if (run_local_work_) { + kernel_name += "_Local"; + } + } else if (output_channel > 4 && run_3d_ndrange_ && !use_buffer_) { is_channel_blocking_ = true; kernel_name += "_CB2"; } @@ -99,6 +118,13 @@ Status OpenCLConvLayer1x1Acc::Reshape(const std::vector &inputs, const s const int output_channels = output_dims[1]; const int output_channel_blocks = UP_DIV(output_channels, 4); + int type_size = sizeof(float); + if (OpenCLRuntime::GetInstance()->GetPrecision() != PRECISION_HIGH) { + type_size = 2; + } + auto &unit = execute_units_[0]; + uint32_t workgroup_size = 0; + if (run_3d_ndrange_) { if (is_channel_blocking_) { execute_units_[0].global_work_size = {static_cast(UP_DIV(output_dims[1], 8)), @@ -114,7 +140,23 @@ Status OpenCLConvLayer1x1Acc::Reshape(const std::vector &inputs, const s Conv2d1x1LocalWS3D(execute_units_[0].global_work_size, execute_units_[0].workgroupsize_max); } else { - if (is_channel_blocking_) { + if (width_blocking_is_1_) { + if (run_local_work_) { + auto &unit = execute_units_[0]; + workgroup_size = std::min(static_cast(unit.local_mem_size / (4 * type_size)), + unit.workgroupsize_max); + workgroup_size = std::min(static_cast(input_channel_blocks), workgroup_size); + int temp_size = 1; + while ((temp_size <<= 1) <= workgroup_size); + workgroup_size = temp_size >> 1; + + execute_units_[0].global_work_size = {static_cast(UP_DIV(output_dims[1], 4) * output_dims[3] * workgroup_size), + static_cast(output_dims[0] * output_dims[2])}; + } else { + execute_units_[0].global_work_size = {static_cast(UP_DIV(output_dims[1], 4) * output_dims[3]), + static_cast(output_dims[0] * output_dims[2])}; + } + } else if (is_channel_blocking_) { execute_units_[0].global_work_size = { static_cast(UP_DIV(output_dims[1], 8) * UP_DIV(output_dims[3], 4)), static_cast(output_dims[0] * output_dims[2])}; @@ -124,8 +166,12 @@ Status OpenCLConvLayer1x1Acc::Reshape(const std::vector &inputs, const s static_cast(output_dims[0] * output_dims[2])}; } - execute_units_[0].local_work_size = Conv2dCommonLocalWS2D( - execute_units_[0].global_work_size, execute_units_[0].workgroupsize_max, execute_units_[0].sub_group_size); + if (!run_local_work_) { + execute_units_[0].local_work_size = Conv2dCommonLocalWS2D( + execute_units_[0].global_work_size, execute_units_[0].workgroupsize_max, execute_units_[0].sub_group_size); + } else { + execute_units_[0].local_work_size = {workgroup_size, 1}; + } } //input width, input height int input_imageshape[2] = {input_dims[3], input_dims[2]}; @@ -154,10 +200,17 @@ Status OpenCLConvLayer1x1Acc::Reshape(const std::vector &inputs, const s execute_units_[0].ocl_kernel.setArg(idx++, sizeof(output_imageshape), output_imageshape); execute_units_[0].ocl_kernel.setArg(idx++, sizeof(stride_shape), stride_shape); } - // set value (output widht / 4) - execute_units_[0].ocl_kernel.setArg(idx++, UP_DIV(output_dims[3], 4)); + if (!width_blocking_is_1_) { + // set value (output width / 4) + execute_units_[0].ocl_kernel.setArg(idx++, UP_DIV(output_dims[3], 4)); + } + + if (run_local_work_) { + execute_units_[0].ocl_kernel.setArg(idx++, UP_DIV(input_channel_blocks, workgroup_size)); + execute_units_[0].ocl_kernel.setArg(idx++, workgroup_size * 4 * type_size, nullptr); + } - if (ocl_context_->GetEnableTuneKernel()) { + if (!run_local_work_ && ocl_context_->GetEnableTuneKernel()) { execute_units_[0].local_work_size = LocalTune(execute_units_[0], ocl_context_, GenerateTuneKernelKey(execute_units_[0])); } diff --git a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_1x1_acc.h b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_1x1_acc.h index 71504a86c..04a6b4ca7 100644 --- a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_1x1_acc.h +++ b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_1x1_acc.h @@ -35,6 +35,8 @@ class OpenCLConvLayer1x1Acc : public OpenCLConvLayerAccImpl { std::vector Conv2d1x1LocalWS3D(std::vector &gws, const uint32_t max_workgroup_size); bool stride_is_1_ = false; + bool width_blocking_is_1_ = false; + bool run_local_work_ = false; }; } // namespace TNN_NS diff --git a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.cc b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.cc index 4ab861ca3..bd9868ef7 100644 --- a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.cc +++ b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_acc_impl.cc @@ -163,8 +163,14 @@ Status OpenCLConvLayerAccImpl::ConvertWeights(float *weights_data_ptr) { if (use_buffer_) { // create weights use clBuffer DimsVector filter_buffershape; - filter_buffershape = {ROUND_UP(conv_params_.output_channel, 4), ROUND_UP(conv_params_.input_channel, 4), - conv_params_.kernel_y, conv_params_.kernel_x}; + if (CT_CONV_DEPTHWISE == conv_type_) { + filter_buffershape = {1, ROUND_UP(conv_params_.output_channel, 4), + conv_params_.kernel_y, conv_params_.kernel_x}; + } else { + filter_buffershape = {ROUND_UP(conv_params_.output_channel, 4), ROUND_UP(conv_params_.input_channel, 4), + conv_params_.kernel_y, conv_params_.kernel_x}; + } + ocl_weights_.reset(new OpenCLMemory(TNN_CL_BUFFER)); size_t type_size = sizeof(float); if (opencl_runtime->GetPrecision() != PRECISION_HIGH) @@ -182,7 +188,11 @@ Status OpenCLConvLayerAccImpl::ConvertWeights(float *weights_data_ptr) { // transfer from clBuffer to clBuffer ImageBufferConvertor convertor(opencl_runtime, ocl_context_->CommandQueue()); - return convertor.ConvertBufferToBuffer(weight_memory.get(), CONV2D_FILTER, filter_shape, ocl_weights_.get(), + OpenCLBufferFormat buffer_format = CONV2D_FILTER; + if (CT_CONV_DEPTHWISE == conv_type_) { + buffer_format = DW_CONV2D_FILTER; + } + return convertor.ConvertBufferToBuffer(weight_memory.get(), buffer_format, filter_shape, ocl_weights_.get(), true); } else { // create weights use clImage diff --git a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_common_acc.cc b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_common_acc.cc index 5d105ffc4..c06d844f8 100644 --- a/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_common_acc.cc +++ b/source/tnn/device/opencl/acc/convolution/opencl_conv_layer_common_acc.cc @@ -63,7 +63,7 @@ Status OpenCLConvLayerCommonAcc::Init(Context *context, LayerParam *param, Layer if (use_buffer_) { kernel_name += "_MIX"; } - int task_size = output_batch * output_channel * output_height * output_width; + int task_size = output_batch * UP_DIV(output_channel, 4) * output_height * output_width; if (task_size > 4096 && output_channel > 4) { is_channel_blocking_ = true; kernel_name += "_CB2"; diff --git a/source/tnn/device/opencl/cl/base.inc b/source/tnn/device/opencl/cl/base.inc index 698792d47..e1f83167f 100644 --- a/source/tnn/device/opencl/cl/base.inc +++ b/source/tnn/device/opencl/cl/base.inc @@ -30,6 +30,12 @@ __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | out##i = mad(in##i.z, weights2, out##i); \ out##i = mad(in##i.w, weights3, out##i); +#define CALCULATE_VEC16_OUTPUT(i) \ + out##i = mad(in##i.x, weights.s0123, out##i); \ + out##i = mad(in##i.y, weights.s4567, out##i); \ + out##i = mad(in##i.z, weights.s89ab, out##i); \ + out##i = mad(in##i.w, weights.scdef, out##i); + #define CALCULATE_SLICE_OUTPUT(s_idx) \ out_w0_s##s_idx += weights_c0_s##s_idx * in0.x; \ out_w1_s##s_idx += weights_c0_s##s_idx * in1.x; \ diff --git a/source/tnn/device/opencl/cl/buffer_to_buffer.cl b/source/tnn/device/opencl/cl/buffer_to_buffer.cl index ca994d4c1..a561ef954 100644 --- a/source/tnn/device/opencl/cl/buffer_to_buffer.cl +++ b/source/tnn/device/opencl/cl/buffer_to_buffer.cl @@ -28,6 +28,60 @@ __kernel void Conv2DFilterBufferToBuffer(GLOBAL_SIZE_2_DIMS __global const float output_ptr[output_offset] = (FLOAT)(val); } +// convert kernel : from buffer(mihw) to buffer(ic/4 h w m ic4) +// but now dw only support m == 1 +__kernel void DWFilterBufferToBuffer(GLOBAL_SIZE_2_DIMS __global const float *input_ptr, + __private const int4 kernel_shape, __private const int height_width_size, + __global FLOAT* output_ptr) { + const int hw_idx = get_global_id(0); + const int ic_block_idx = get_global_id(1); + + DEAL_NON_UNIFORM_DIM2(hw_idx, ic_block_idx); + + FLOAT4 output_values = 0; + if (kernel_shape.x == 1) { + const int input_channel_4_idx = ic_block_idx << 2; + const int buffer_height_idx = hw_idx / kernel_shape.w; + const int buffer_width_idx = hw_idx % kernel_shape.w; + + const int buffer_offset = + mad24(mad24(input_channel_4_idx, kernel_shape.z, buffer_height_idx), kernel_shape.w, buffer_width_idx); + + const int remain_channel = kernel_shape.y - input_channel_4_idx; + if (input_channel_4_idx < kernel_shape.y) { + if (remain_channel >= 4) { + int offset = buffer_offset; + output_values.x = (FLOAT)(*(input_ptr + offset)); + offset += height_width_size; + output_values.y = (FLOAT)(*(input_ptr + offset)); + offset += height_width_size; + output_values.z = (FLOAT)(*(input_ptr + offset)); + offset += height_width_size; + output_values.w = (FLOAT)(*(input_ptr + offset)); + } else if (remain_channel == 3) { + int offset = buffer_offset; + output_values.x = (FLOAT)(*(input_ptr + offset)); + offset += height_width_size; + output_values.y = (FLOAT)(*(input_ptr + offset)); + offset += height_width_size; + output_values.z = (FLOAT)(*(input_ptr + offset)); + + } else if (remain_channel == 2) { + int offset = buffer_offset; + output_values.x = (FLOAT)(*(input_ptr + offset)); + offset += height_width_size; + output_values.y = (FLOAT)(*(input_ptr + offset)); + } else if (remain_channel == 1) { + int offset = buffer_offset; + output_values.x = (FLOAT)(*(input_ptr + offset)); + } + } + const int output_offset = mad24(mad24(ic_block_idx, kernel_shape.z, buffer_height_idx), + kernel_shape.w, buffer_width_idx) << 2; + vstore4(output_values, 0, output_ptr + output_offset); + } +} + // convert arg as 4 alignment __kernel void ArgBufferToBuffer(GLOBAL_SIZE_2_DIMS __global const float *input_ptr, __private const int count, __global FLOAT* output_ptr) { diff --git a/source/tnn/device/opencl/cl/convolution.cl b/source/tnn/device/opencl/cl/convolution.cl index ab6c84ca1..1fc56fb20 100644 --- a/source/tnn/device/opencl/cl/convolution.cl +++ b/source/tnn/device/opencl/cl/convolution.cl @@ -3,8 +3,8 @@ #include "io.inc" __kernel void Conv2D1x1_S1_MIX(GLOBAL_SIZE_2_DIMS __read_only image2d_t input, - __global const FLOAT *weights_ptr, - __global const FLOAT *bias_ptr, + __global const FLOAT16 *weights_ptr, + __global const FLOAT4 *bias_ptr, __write_only image2d_t output, __private const int2 wh, __private const int input_c_blocks, __private const int output_w_updiv_4) { @@ -17,14 +17,11 @@ __kernel void Conv2D1x1_S1_MIX(GLOBAL_SIZE_2_DIMS __read_only image2d_t input, const int output_c_block_idx = output_cw_idx / output_w_updiv_4; const int output_w_block_idx = output_cw_idx % output_w_updiv_4; - FLOAT4 out0 = vload4(output_c_block_idx, (__global FLOAT *)bias_ptr); + FLOAT4 out0 = bias_ptr[output_c_block_idx]; FLOAT4 out1 = out0; FLOAT4 out2 = out0; FLOAT4 out3 = out0; - FLOAT4 in0, in1, in2, in3; - FLOAT4 weights0, weights1, weights2, weights3; - const int out_x_idx = output_w_block_idx << 2; int input_w_idx0 = out_x_idx; @@ -38,25 +35,21 @@ __kernel void Conv2D1x1_S1_MIX(GLOBAL_SIZE_2_DIMS __read_only image2d_t input, input_w_idx3 = select(input_w_idx3, INT_MIN, input_w_idx3 >= wh.x); int input_w_base = 0; - int weights_offset = mul24(output_c_block_idx, input_c_blocks << 2); + int weights_offset = mul24(output_c_block_idx, input_c_blocks); for (int input_c_block_idx = 0; input_c_block_idx < input_c_blocks; ++input_c_block_idx) { - in0 = RI_F(input, SAMPLER, (int2)(input_w_base + input_w_idx0, bh_idx)); - in1 = RI_F(input, SAMPLER, (int2)(input_w_base + input_w_idx1, bh_idx)); - in2 = RI_F(input, SAMPLER, (int2)(input_w_base + input_w_idx2, bh_idx)); - in3 = RI_F(input, SAMPLER, (int2)(input_w_base + input_w_idx3, bh_idx)); - - weights0 = vload4(weights_offset, (__global FLOAT *)weights_ptr); - weights1 = vload4(weights_offset + 1, (__global FLOAT *)weights_ptr); - weights2 = vload4(weights_offset + 2, (__global FLOAT *)weights_ptr); - weights3 = vload4(weights_offset + 3, (__global FLOAT *)weights_ptr); + FLOAT4 in0 = RI_F(input, SAMPLER, (int2)(input_w_base + input_w_idx0, bh_idx)); + FLOAT4 in1 = RI_F(input, SAMPLER, (int2)(input_w_base + input_w_idx1, bh_idx)); + FLOAT4 in2 = RI_F(input, SAMPLER, (int2)(input_w_base + input_w_idx2, bh_idx)); + FLOAT4 in3 = RI_F(input, SAMPLER, (int2)(input_w_base + input_w_idx3, bh_idx)); - CALCULATE_OUTPUT(0); - CALCULATE_OUTPUT(1); - CALCULATE_OUTPUT(2); - CALCULATE_OUTPUT(3); + FLOAT16 weights = weights_ptr[weights_offset]; + CALCULATE_VEC16_OUTPUT(0); + CALCULATE_VEC16_OUTPUT(1); + CALCULATE_VEC16_OUTPUT(2); + CALCULATE_VEC16_OUTPUT(3); input_w_base += wh.x; - weights_offset += 4; + weights_offset++; } out0 = ActivationProcess(out0); @@ -72,6 +65,112 @@ __kernel void Conv2D1x1_S1_MIX(GLOBAL_SIZE_2_DIMS __read_only image2d_t input, bh_idx, remain); } +__kernel void Conv2D1x1_S1_MIX_WB1(GLOBAL_SIZE_2_DIMS __read_only image2d_t input, + __global const FLOAT16 *weights_ptr, + __global const FLOAT4 *bias_ptr, + __write_only image2d_t output, __private const int2 wh, + __private const int input_c_blocks) { + + const int output_cw_idx = get_global_id(0); //c/4 w + const int bh_idx = get_global_id(1); //b h + + DEAL_NON_UNIFORM_DIM2(output_cw_idx, bh_idx); + + const int output_c_block_idx = output_cw_idx / wh.x; + const int out_x_idx = output_cw_idx % wh.x; + + FLOAT4 out0 = bias_ptr[output_c_block_idx]; + + int input_w_idx0 = out_x_idx; + + input_w_idx0 = select(input_w_idx0, INT_MIN, input_w_idx0 >= wh.x); + + int input_w_base = 0; + int weights_offset = mul24(output_c_block_idx, input_c_blocks); + for (int input_c_block_idx = 0; input_c_block_idx < input_c_blocks; ++input_c_block_idx) { + FLOAT4 in0 = RI_F(input, SAMPLER, (int2)(input_w_base + input_w_idx0, bh_idx)); + + FLOAT16 weights = weights_ptr[weights_offset]; + + CALCULATE_VEC16_OUTPUT(0); + + input_w_base += wh.x; + weights_offset++; + } + + out0 = ActivationProcess(out0); + + const int out_x_base = mul24(output_c_block_idx, wh.x); + + int output_w_idx = out_x_base + out_x_idx; + WI_F(output, (int2)(output_w_idx, bh_idx), out0); +} + +__kernel void Conv2D1x1_S1_MIX_WB1_Local(GLOBAL_SIZE_2_DIMS __read_only image2d_t input, + __global const FLOAT16 *weights_ptr, + __global const FLOAT4 *bias_ptr, + __write_only image2d_t output, __private const int2 wh, + __private const int input_c_blocks, + __private const int local_block_size, + __local FLOAT4* local_output) { + + const int local_id = get_local_id(0); + const int group_size = get_local_size(0); + const int global_id = get_global_id(0); + const int output_cw_idx = global_id / group_size; //c/4 w + const int bh_idx = get_global_id(1); //b h + + DEAL_NON_UNIFORM_DIM2(global_id, bh_idx); + + const int output_c_block_idx = output_cw_idx / wh.x; + const int out_x_idx = output_cw_idx % wh.x; + + local_output[local_id] = (FLOAT4)0.f; + + int input_w_idx0 = out_x_idx; + + input_w_idx0 = select(input_w_idx0, INT_MIN, input_w_idx0 >= wh.x); + + int pos = local_id; + int input_w_stride = mul24(group_size, wh.x); + int weights_stride = group_size; + int input_w_base = mul24(pos, wh.x); + int weights_offset = mad24(output_c_block_idx, input_c_blocks, pos); + for (unsigned short i = 0; i < local_block_size; i++) { + if (pos >= input_c_blocks) break; + FLOAT4 in0 = RI_F(input, SAMPLER, (int2)(input_w_base + input_w_idx0, bh_idx)); + + FLOAT16 weights = weights_ptr[weights_offset]; + + local_output[local_id] += weights.s0123 * in0.x; + local_output[local_id] += weights.s4567 * in0.y; + local_output[local_id] += weights.s89ab * in0.z; + local_output[local_id] += weights.scdef * in0.w; + + input_w_base += input_w_stride; + weights_offset += weights_stride; + pos += group_size; + } + + barrier(CLK_LOCAL_MEM_FENCE); + for (unsigned short stride = (group_size >> 1); stride > 0; stride >>= 1) { + if (local_id < stride) { + local_output[local_id] += local_output[local_id + stride]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (local_id == 0) { + local_output[local_id] += bias_ptr[output_c_block_idx]; + local_output[local_id] = ActivationProcess(local_output[local_id]); + + const int out_x_base = mul24(output_c_block_idx, wh.x); + + int output_w_idx = out_x_base + out_x_idx; + WI_F(output, (int2)(output_w_idx, bh_idx), local_output[local_id]); + } +} + __kernel void Conv2D1x1_S1_MIX_CB2(GLOBAL_SIZE_2_DIMS __read_only image2d_t input, __global const FLOAT *weights_ptr, __global const FLOAT *bias_ptr, diff --git a/source/tnn/device/opencl/imagebuffer_convertor.cc b/source/tnn/device/opencl/imagebuffer_convertor.cc index 4da4c05d3..86a55aef7 100644 --- a/source/tnn/device/opencl/imagebuffer_convertor.cc +++ b/source/tnn/device/opencl/imagebuffer_convertor.cc @@ -179,6 +179,8 @@ Status ImageBufferConvertor::ConvertBufferToBuffer(const OpenCLMemory *input, co std::string kernel_name; if (type == CONV2D_FILTER) { kernel_name = "Conv2DFilterBufferToBuffer"; + } else if (type == DW_CONV2D_FILTER) { + kernel_name = "DWFilterBufferToBuffer"; } else if (type == ARGUMENT && dims.size() == 1) { kernel_name = "ArgBufferToBuffer"; } else { @@ -198,6 +200,9 @@ Status ImageBufferConvertor::ConvertBufferToBuffer(const OpenCLMemory *input, co if (type == CONV2D_FILTER) { buffer_to_buffer_unit_.global_work_size.push_back(ROUND_UP(dims[0], 4)); buffer_to_buffer_unit_.global_work_size.push_back(dims[2] * dims[3] * ROUND_UP(dims[1], 4)); + } else if (type == DW_CONV2D_FILTER) { + buffer_to_buffer_unit_.global_work_size.push_back(dims[2] * dims[3]); + buffer_to_buffer_unit_.global_work_size.push_back(UP_DIV(dims[1], 4)); } else if (type == ARGUMENT && dims.size() == 1) { buffer_to_buffer_unit_.global_work_size.push_back(UP_DIV(dims[0], 4)); buffer_to_buffer_unit_.global_work_size.push_back(1); @@ -221,6 +226,12 @@ Status ImageBufferConvertor::ConvertBufferToBuffer(const OpenCLMemory *input, co buffer_to_buffer_unit_.ocl_kernel.setArg(idx++, static_cast(dims[1])); buffer_to_buffer_unit_.ocl_kernel.setArg(idx++, sizeof(kernel_shape), kernel_shape); buffer_to_buffer_unit_.ocl_kernel.setArg(idx++, static_cast(w_h_size)); + } else if (type == DW_CONV2D_FILTER) { + //height * width + const int w_h_size = dims[2] * dims[3]; + int kernel_shape[4] = {dims[0], dims[1], dims[2], dims[3]}; + buffer_to_buffer_unit_.ocl_kernel.setArg(idx++, sizeof(kernel_shape), kernel_shape); + buffer_to_buffer_unit_.ocl_kernel.setArg(idx++, static_cast(w_h_size)); } else if (type == ARGUMENT) { //batch buffer_to_buffer_unit_.ocl_kernel.setArg(idx++, static_cast(dims[0])); diff --git a/source/tnn/device/opencl/opencl_runtime.cc b/source/tnn/device/opencl/opencl_runtime.cc index 6c97dc752..baeebd688 100644 --- a/source/tnn/device/opencl/opencl_runtime.cc +++ b/source/tnn/device/opencl/opencl_runtime.cc @@ -277,13 +277,13 @@ Status OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program //fp16 enable, kernel will use half and read_imageh and write_imageh. LOGD("OpenCL Caucluate Pricision is Half!\n"); build_options_str = - "-DFLOAT=half -DFLOAT4=half4 -DCONVERT_INT=convert_short -DCONVERT_FLOAT4=convert_half4 -DRI_F=read_imageh " + "-DFLOAT=half -DFLOAT4=half4 -DFLOAT16=half16 -DCONVERT_INT=convert_short -DCONVERT_FLOAT4=convert_half4 -DRI_F=read_imageh " "-DWI_F=write_imageh"; } else { //fp16 not enable, kernel will use float and read_imagef and write_imagef. LOGD("OpenCL Caucluate Pricision is Float!\n"); build_options_str = - "-DFLOAT=float -DFLOAT4=float4 -DCONVERT_INT=convert_int -DCONVERT_FLOAT4=convert_float4 -DRI_F=read_imagef " + "-DFLOAT=float -DFLOAT4=float4 -DFLOAT16=float16 -DCONVERT_INT=convert_int -DCONVERT_FLOAT4=convert_float4 -DRI_F=read_imagef " "-DWI_F=write_imagef"; } for (auto &option : build_options) {