From e946449d726682c0bf56aac6f5362698f4144157 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Thu, 19 Nov 2020 19:20:33 +0800 Subject: [PATCH 01/82] startup of dev scatter ops --- oneflow/python/ops/array_ops.py | 22 ++++ .../user/kernels/dim_gather_kernel_util.cpp | 12 -- .../user/kernels/dim_gather_kernel_util.cu | 37 +----- oneflow/user/kernels/dim_gather_kernel_util.h | 56 +-------- oneflow/user/kernels/dim_gather_kernels.cpp | 69 ----------- .../user/kernels/dim_gather_scatter_util.h | 55 +++++++++ .../user/kernels/dim_scatter_kernel_util.cpp | 36 ++++++ .../user/kernels/dim_scatter_kernel_util.cu | 60 ++++++++++ .../user/kernels/dim_scatter_kernel_util.h | 53 +++++++++ oneflow/user/kernels/dim_scatter_kernels.cpp | 83 +++++++++++++ oneflow/user/ops/dim_gather_op.cpp | 77 ------------ oneflow/user/ops/dim_scatter_ops.cpp | 111 ++++++++++++++++++ 12 files changed, 422 insertions(+), 249 deletions(-) create mode 100644 oneflow/user/kernels/dim_gather_scatter_util.h create mode 100644 oneflow/user/kernels/dim_scatter_kernel_util.cpp create mode 100644 oneflow/user/kernels/dim_scatter_kernel_util.cu create mode 100644 oneflow/user/kernels/dim_scatter_kernel_util.h create mode 100644 oneflow/user/kernels/dim_scatter_kernels.cpp create mode 100644 oneflow/user/ops/dim_scatter_ops.cpp diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 55485f6dfff..9925ca69016 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2312,3 +2312,25 @@ def amp_white_identity( .Build() ) return op.InferAndTryRun().SoleOutputBlob() + + +@oneflow_export("dim_scatter") +def dim_scatter( + dim: int, + index: remote_blob_util.BlobDef, + src: remote_blob_util.BlobDef, + name: Optional[str] = None, +) -> remote_blob_util.BlobDef: + return ( + flow.user_op_builder( + name if name is not None else id_util.UniqueStr("DimScatter_") + ) + .Op("dim_scatter") + .Input("input", [src]) + .Input("index", [index]) + .Output("output") + .Attr("dim", int(dim)) + .Build() + .InferAndTryRun() + .RemoteBlobList()[0] + ) diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cpp b/oneflow/user/kernels/dim_gather_kernel_util.cpp index d12d10a9e67..abb3b50faff 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cpp +++ b/oneflow/user/kernels/dim_gather_kernel_util.cpp @@ -30,20 +30,8 @@ struct DimGatherFunctor final { } }; -template -struct DimScatterAddFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - DoDimScatterAdd(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, - input, output); - } -}; - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_GATHER_FUNCTOR, (DeviceType::kCPU), DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ, INDEX_DATA_TYPE_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR, (DeviceType::kCPU), - DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ, INDEX_DATA_TYPE_SEQ); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cu b/oneflow/user/kernels/dim_gather_kernel_util.cu index 767023838c5..fa4b7e10ca2 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cu +++ b/oneflow/user/kernels/dim_gather_kernel_util.cu @@ -13,9 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include #ifdef WITH_CUDA -#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/util/cuda_kernel_util.h" #include "oneflow/user/kernels/dim_gather_kernel_util.h" namespace oneflow { @@ -53,42 +52,8 @@ struct DimGatherFunctor final { } }; -template -__global__ void DoCUDAScatterDimAdd(const DimOpIndexNdHelper input_nd_helper, - const DimOpIndexNdHelper output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, - const IN_T* input, IN_T* output) { - DoDimScatterAdd(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, - output); -} - -template -struct DimScatterAddFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - RUN_CUDA_KERNEL((DoCUDAScatterDimAdd), ctx, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, output); - } -}; - -// float16 special case of DimScatterAddFunctor template -template -struct DimScatterAddFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const float16* input, float16* output) { - RUN_CUDA_KERNEL((DoCUDAScatterDimAdd), ctx, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, - reinterpret_cast(input), reinterpret_cast(output)); - } -}; - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_GATHER_FUNCTOR, (DeviceType::kGPU), DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ, INDEX_DATA_TYPE_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR, (DeviceType::kGPU), - DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ, INDEX_DATA_TYPE_SEQ); - } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernel_util.h b/oneflow/user/kernels/dim_gather_kernel_util.h index 3537682113e..f5b4e18e2f7 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.h +++ b/oneflow/user/kernels/dim_gather_kernel_util.h @@ -15,27 +15,10 @@ limitations under the License. 
*/ #ifndef ONEFLOW_USER_KERNELS_DIM_GATHER_KERNEL_UTIL_H_ #define ONEFLOW_USER_KERNELS_DIM_GATHER_KERNEL_UTIL_H_ -#ifdef WITH_CUDA -#include "oneflow/core/kernel/util/cuda_kernel_util.h" -#endif // WITH_CUDA -#include "oneflow/core/ndarray/xpu_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/user/kernels/dim_gather_scatter_util.h" namespace oneflow { -#define DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ \ - FLOATING_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) - -#define DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ \ - DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ \ - FLOAT16_DATA_TYPE_SEQ - -constexpr int kDimGatherMaxDimCount = 8; - -template -using DimOpIndexNdHelper = NdIndexOffsetHelper; - namespace user_op { template @@ -45,13 +28,6 @@ struct DimGatherFunctor final { int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output); }; -template -struct DimScatterAddFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output); -}; - template OF_DEVICE_FUNC void DoDimGather(const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& index_nd_helper, int ndim, @@ -68,41 +44,11 @@ OF_DEVICE_FUNC void DoDimGather(const DimOpIndexNdHelper& input_nd_helper } } -template -struct DeviceAdd { - OF_DEVICE_FUNC static void Invoke(const T* x, T* y) { -#ifdef __CUDA_ARCH__ - gpu_atomic_add(y, *x); // TODO:(YaoChi), refine add using float16 -> half -> float -> half -#else - *y += *x; -#endif - }; -}; - -template -OF_DEVICE_FUNC void DoDimScatterAdd(const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, - const IN_T* input, IN_T* output) { - XPU_1D_KERNEL_LOOP(input_offset, elem_cnt) { - IDX_T coordinate[kDimGatherMaxDimCount] = {0}; - input_nd_helper.OffsetToNdIndex(input_offset, coordinate, ndim); - coordinate[dim] = index[input_offset]; - - IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); - DeviceAdd::Invoke(input + input_offset, output + output_offset); - } -} - // macros for functors instantiate(used by dim_gather_kernel_util.cu and dim_gather_kernel_uti.cpp) #define INSTANTIATE_DIM_GATHER_FUNCTOR(device_type_v, dtype_pair, itype_pair) \ template struct DimGatherFunctor; -#define INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR(device_type_v, dtype_pair, itype_pair) \ - template struct DimScatterAddFunctor; - } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernels.cpp b/oneflow/user/kernels/dim_gather_kernels.cpp index d5616b9673b..ddc8afce08f 100644 --- a/oneflow/user/kernels/dim_gather_kernels.cpp +++ b/oneflow/user/kernels/dim_gather_kernels.cpp @@ -14,23 +14,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/common/shape_view.h" -#include "oneflow/core/framework/framework.h" #include "oneflow/user/kernels/dim_gather_kernel_util.h" namespace oneflow { namespace user_op { -namespace { - -template -void ConvertShape2Array(const ShapeView& shape_view, IDX_T* array, int64_t num_axis) { - FOR_RANGE(int64_t, i, 0, num_axis) { array[i] = shape_view.At(i); } -} - -} // namespace - template class DimGatherKernel final : public user_op::OpKernel { public: @@ -66,44 +54,6 @@ class DimGatherKernel final : public user_op::OpKernel { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -template -class ScatterDimKernel final : public user_op::OpKernel { - public: - ScatterDimKernel() = default; - ~ScatterDimKernel() override = default; - - private: - void Compute(KernelComputeContext* ctx) const override { - const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); - const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); - Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); - const int32_t dim = ctx->Attr("dim"); - - const IN_T* src = input_tensor->dptr(); - const IDX_T* index = index_tensor->dptr(); - IN_T* output = out_tensor->mut_dptr(); - size_t out_bytes_size = - out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); - Memset(ctx->device_ctx(), output, 0, out_bytes_size); - - int ndim = input_tensor->shape().NumAxes(); - fixed_vector shape_vec(ndim); - auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { - std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), - [](int64_t dim) -> IDX_T { return static_cast(dim); }); - }; - shape2dims(input_tensor->shape()); - DimOpIndexNdHelper input_nd_helper(shape_vec.data(), ndim); - shape2dims(out_tensor->shape()); - DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); - - DimScatterAddFunctor()( - ctx->device_ctx(), input_nd_helper, output_nd_helper, ndim, - input_tensor->shape().elem_cnt(), dim, index, src, output); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - #define REGISTER_DIM_GATHER_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_gather") \ .SetCreateFn>() \ @@ -111,13 +61,6 @@ class ScatterDimKernel final : public user_op::OpKernel { & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -#define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype) \ - REGISTER_USER_KERNEL("dim_scatter_add_like") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); - #define REGISTER_DIM_GATHER_KERNELS_WITH_DEVICE(device) \ REGISTER_DIM_GATHER_KERNEL(device, float, int32_t) \ REGISTER_DIM_GATHER_KERNEL(device, double, int32_t) \ @@ -126,23 +69,11 @@ class ScatterDimKernel final : public user_op::OpKernel { REGISTER_DIM_GATHER_KERNEL(device, double, int64_t) \ REGISTER_DIM_GATHER_KERNEL(device, int32_t, int64_t) -#define REGISTER_DIM_SCATTER_ADD_LIKE_KERNELS_WITH_DEVICE(device) \ - REGISTER_DIM_SCATTER_KERNEL(device, float, int32_t) \ - REGISTER_DIM_SCATTER_KERNEL(device, double, int32_t) \ - REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int32_t) \ - REGISTER_DIM_SCATTER_KERNEL(device, float, int64_t) \ - REGISTER_DIM_SCATTER_KERNEL(device, double, int64_t) \ - REGISTER_DIM_SCATTER_KERNEL(device, int32_t, 
int64_t) - REGISTER_DIM_GATHER_KERNELS_WITH_DEVICE(DeviceType::kCPU); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNELS_WITH_DEVICE(DeviceType::kCPU); #ifdef WITH_CUDA REGISTER_DIM_GATHER_KERNELS_WITH_DEVICE(DeviceType::kGPU); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNELS_WITH_DEVICE(DeviceType::kGPU); REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int32_t); -REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int32_t); -REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int64_t); REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int64_t); #endif // WITH_CUDA diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h new file mode 100644 index 00000000000..f0c451ecb71 --- /dev/null +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -0,0 +1,55 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_USER_KERNELS_DIM_GAHTER_SCATTER__UTIL_H_ +#define ONEFLOW_USER_KERNELS_DIM_GAHTER_SCATTER__UTIL_H_ +#include "oneflow/core/ndarray/xpu_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/common/shape_view.h" + +namespace oneflow { + +#define DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ \ + FLOATING_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) + +#define DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ \ + DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ \ + FLOAT16_DATA_TYPE_SEQ + +constexpr int kDimGatherMaxDimCount = 8; + +namespace user_op { + +template +using DimOpIndexNdHelper = NdIndexOffsetHelper; + +template +struct DeviceAdd { + OF_DEVICE_FUNC static void Invoke(const T* x, T* y) { +#ifdef __CUDA_ARCH__ + gpu_atomic_add(y, *x); // TODO:(YaoChi), refine add using float16 -> half -> float -> half +#else + *y += *x; +#endif + }; +}; + +} // namespace user_op +} // namespace oneflow + +#endif \ No newline at end of file diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.cpp new file mode 100644 index 00000000000..3bcd196b5f1 --- /dev/null +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cpp @@ -0,0 +1,36 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/dim_scatter_kernel_util.h" + +namespace oneflow { +namespace user_op { + +template +struct DimScatterAddFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { + DoDimScatterAdd(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, + input, output); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR, (DeviceType::kCPU), + DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ, INDEX_DATA_TYPE_SEQ); +} // namespace user_op +} // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cu b/oneflow/user/kernels/dim_scatter_kernel_util.cu new file mode 100644 index 00000000000..ffdd614c6bd --- /dev/null +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cu @@ -0,0 +1,60 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef WITH_CUDA +#include "oneflow/core/kernel/util/cuda_kernel_util.h" +#include "oneflow/user/kernels/dim_scatter_kernel_util.h" + +namespace oneflow { +namespace user_op { + +template +__global__ void DoCUDAScatterDimAdd(const DimOpIndexNdHelper input_nd_helper, + const DimOpIndexNdHelper output_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, const IDX_T* index, + const IN_T* input, IN_T* output) { + DoDimScatterAdd(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, + output); +} + +template +struct DimScatterAddFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { + RUN_CUDA_KERNEL((DoCUDAScatterDimAdd), ctx, BlocksNum4ThreadsNum(elem_cnt), + input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, output); + } +}; + +// float16 special case of DimScatterAddFunctor template +template +struct DimScatterAddFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const float16* input, float16* output) { + RUN_CUDA_KERNEL((DoCUDAScatterDimAdd), ctx, BlocksNum4ThreadsNum(elem_cnt), + input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, + reinterpret_cast(input), reinterpret_cast(output)); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR, (DeviceType::kGPU), + DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ, INDEX_DATA_TYPE_SEQ); + +} // namespace user_op +} // namespace oneflow + +#endif // WITH_CUDA diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h new file mode 100644 index 00000000000..805b89b9b7d --- /dev/null +++ 
b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -0,0 +1,53 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_USER_KERNELS_DIM_SCATTER_KERNEL_UTIL_H_ +#define ONEFLOW_USER_KERNELS_DIM_SCATTER_KERNEL_UTIL_H_ +#include "oneflow/user/kernels/dim_gather_scatter_util.h" + +namespace oneflow { + +namespace user_op { + +template +struct DimScatterAddFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output); +}; + +template +OF_DEVICE_FUNC void DoDimScatterAdd(const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, const IDX_T* index, + const IN_T* input, IN_T* output) { + XPU_1D_KERNEL_LOOP(input_offset, elem_cnt) { + IDX_T coordinate[kDimGatherMaxDimCount] = {0}; + input_nd_helper.OffsetToNdIndex(input_offset, coordinate, ndim); + coordinate[dim] = index[input_offset]; + + IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); + DeviceAdd::Invoke(input + input_offset, output + output_offset); + } +} + +#define INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR(device_type_v, dtype_pair, itype_pair) \ + template struct DimScatterAddFunctor; + +} // namespace user_op +} // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_DIM_GATHER_KERNEL_UTIL_H_ diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp new file mode 100644 index 00000000000..a9d9b2bc6ed --- /dev/null +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -0,0 +1,83 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/user/kernels/dim_scatter_kernel_util.h" + +namespace oneflow { +namespace user_op { + +template +class ScatterDimKernel final : public user_op::OpKernel { + public: + ScatterDimKernel() = default; + ~ScatterDimKernel() override = default; + + private: + void Compute(KernelComputeContext* ctx) const override { + const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); + const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); + Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); + const int32_t dim = ctx->Attr("dim"); + + const IN_T* src = input_tensor->dptr(); + const IDX_T* index = index_tensor->dptr(); + IN_T* output = out_tensor->mut_dptr(); + size_t out_bytes_size = + out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); + Memset(ctx->device_ctx(), output, 0, out_bytes_size); + + int ndim = input_tensor->shape().NumAxes(); + fixed_vector shape_vec(ndim); + auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { + std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), + [](int64_t dim) -> IDX_T { return static_cast(dim); }); + }; + shape2dims(input_tensor->shape()); + DimOpIndexNdHelper input_nd_helper(shape_vec.data(), ndim); + shape2dims(out_tensor->shape()); + DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); + + DimScatterAddFunctor()( + ctx->device_ctx(), input_nd_helper, output_nd_helper, ndim, + input_tensor->shape().elem_cnt(), dim, index, src, output); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype) \ + REGISTER_USER_KERNEL("dim_scatter_add_like") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); + +#define REGISTER_DIM_SCATTER_ADD_LIKE_KERNELS_WITH_DEVICE(device) \ + REGISTER_DIM_SCATTER_KERNEL(device, float, int32_t) \ + REGISTER_DIM_SCATTER_KERNEL(device, double, int32_t) \ + REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int32_t) \ + REGISTER_DIM_SCATTER_KERNEL(device, float, int64_t) \ + REGISTER_DIM_SCATTER_KERNEL(device, double, int64_t) \ + REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int64_t) + +REGISTER_DIM_SCATTER_ADD_LIKE_KERNELS_WITH_DEVICE(DeviceType::kCPU); + +#ifdef WITH_CUDA +REGISTER_DIM_SCATTER_ADD_LIKE_KERNELS_WITH_DEVICE(DeviceType::kGPU); +REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int32_t); +REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int64_t); +#endif // WITH_CUDA + +} // namespace user_op +} // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index 2f940a18193..95bde8b7aea 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp @@ -108,83 +108,6 @@ REGISTER_USER_OP("dim_gather") return Maybe::Ok(); }); -REGISTER_USER_OP("dim_scatter_add_like") - .Input("like") - .Input("input") - .Input("index") - .Output("output") - .Attr("dim") - .SetTensorDescInferFn([](user_op::InferContext* ctx) -> Maybe { - const TensorDesc* input = ctx->TensorDesc4ArgNameAndIndex("input", 0); - const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); - const TensorDesc* like = ctx->TensorDesc4ArgNameAndIndex("like", 0); - - const Shape& like_shape = like->shape(); - int32_t dim = ctx->Attr("dim"); - - const SbpParallel& input_sbp = 
ctx->SbpParallel4ArgNameAndIndex("input", 0); - int64_t split_axis = input_sbp.split_parallel().axis(); - if (ctx->parallel_ctx().parallel_num() != 1 && input_sbp.has_split_parallel()) { - CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; - } - - int64_t input_num_axes = input->shape().NumAxes(); - CHECK_GT_OR_RETURN(input_num_axes, 0); - CHECK_LE_OR_RETURN(input_num_axes, kDimGatherMaxDimCount); - - int64_t index_num_axes = index->shape().NumAxes(); - CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); - CHECK_EQ_OR_RETURN(input_num_axes, like_shape.NumAxes()); - - FOR_RANGE(int64_t, i, 0, input_num_axes) { - CHECK_EQ_OR_RETURN(index->shape().At(i), input->shape().At(i)); - } - - user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); - *out->mut_shape() = like_shape; - *out->mut_data_type() = input->data_type(); - - return Maybe::Ok(); - }) - .SetInputArgModifyFn([](user_op::GetInputArgModifier GetInputArgModifierFn, - const user_op::UserOpConfWrapper&) { - user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); - CHECK(like_arg_modifier != nullptr); - like_arg_modifier->set_use_header_only(true); - like_arg_modifier->set_requires_grad(false); - }) - .SetBatchAxisInferFn([](user_op::BatchAxisContext* ctx) -> Maybe { - CHECK_OR_RETURN(*ctx->BatchAxis4ArgNameAndIndex("index", 0) - == *ctx->BatchAxis4ArgNameAndIndex("input", 0)); - *ctx->BatchAxis4ArgNameAndIndex("output", 0) = *ctx->BatchAxis4ArgNameAndIndex("input", 0); - return Maybe::Ok(); - }) - .SetGetSbpFn([](user_op::SbpContext* ctx) -> Maybe { - const user_op::TensorDesc& index_tensor = - ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); - int64_t index_num_axes = index_tensor.shape().NumAxes(); - const int32_t dim = ctx->Attr("dim"); - - FOR_RANGE(int64_t, i, 0, index_num_axes) { - if (i != dim) { - ctx->NewBuilder() - .Split(user_op::OpArg("index", 0), i) - .Split(user_op::OpArg("input", 0), i) - .Split(user_op::OpArg("output", 0), i) - .Split(user_op::OpArg("like", 0), i) - .Build(); - } - } - - ctx->NewBuilder() - .PartialSum(user_op::OpArg("input", 0)) - .Broadcast(user_op::OpArg("index", 0)) - .PartialSum(user_op::OpArg("output", 0)) - .PartialSum(user_op::OpArg("like", 0)) - .Build(); - return Maybe::Ok(); - }); - REGISTER_USER_OP_GRAD("dim_gather").SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { const auto op_grad_name = ctx->FwOp().op_name() + "_grad"; diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp new file mode 100644 index 00000000000..13ad166a2e8 --- /dev/null +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -0,0 +1,111 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/common/maybe.h" +#include "oneflow/core/framework/user_op_registry.h" +#include "oneflow/user/kernels/dim_gather_scatter_util.h" + +namespace oneflow { + +namespace user_op { + +namespace { +Maybe InferTensorDesc(user_op::InferContext* ctx) { + const TensorDesc* input = ctx->TensorDesc4ArgNameAndIndex("input", 0); + const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); + const TensorDesc* like = ctx->TensorDesc4ArgNameAndIndex("like", 0); + + const Shape& like_shape = like->shape(); + int32_t dim = ctx->Attr("dim"); + + const SbpParallel& input_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); + int64_t split_axis = input_sbp.split_parallel().axis(); + if (ctx->parallel_ctx().parallel_num() != 1 && input_sbp.has_split_parallel()) { + CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; + } + + int64_t input_num_axes = input->shape().NumAxes(); + CHECK_GT_OR_RETURN(input_num_axes, 0); + CHECK_LE_OR_RETURN(input_num_axes, kDimGatherMaxDimCount); + + int64_t index_num_axes = index->shape().NumAxes(); + CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); + CHECK_EQ_OR_RETURN(input_num_axes, like_shape.NumAxes()); + + FOR_RANGE(int64_t, i, 0, input_num_axes) { + CHECK_EQ_OR_RETURN(index->shape().At(i), input->shape().At(i)); + } + + user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); + *out->mut_shape() = like_shape; + *out->mut_data_type() = input->data_type(); + + return Maybe::Ok(); +} + +Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, + const user_op::UserOpConfWrapper&) { + user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); + CHECK(like_arg_modifier != nullptr); + like_arg_modifier->set_use_header_only(true); + like_arg_modifier->set_requires_grad(false); + return Maybe::Ok(); +} + +Maybe InferBatchAxis(user_op::BatchAxisContext* ctx) { + CHECK_OR_RETURN(*ctx->BatchAxis4ArgNameAndIndex("index", 0) + == *ctx->BatchAxis4ArgNameAndIndex("input", 0)); + *ctx->BatchAxis4ArgNameAndIndex("output", 0) = *ctx->BatchAxis4ArgNameAndIndex("input", 0); + return Maybe::Ok(); +} + +Maybe SetSbp(user_op::SbpContext* ctx) { + const user_op::TensorDesc& index_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); + int64_t index_num_axes = index_tensor.shape().NumAxes(); + const int32_t dim = ctx->Attr("dim"); + + FOR_RANGE(int64_t, i, 0, index_num_axes) { + if (i != dim) { + ctx->NewBuilder() + .Split(user_op::OpArg("index", 0), i) + .Split(user_op::OpArg("input", 0), i) + .Split(user_op::OpArg("output", 0), i) + .Split(user_op::OpArg("like", 0), i) + .Build(); + } + } + + ctx->NewBuilder() + .PartialSum(user_op::OpArg("input", 0)) + .Broadcast(user_op::OpArg("index", 0)) + .PartialSum(user_op::OpArg("output", 0)) + .PartialSum(user_op::OpArg("like", 0)) + .Build(); + return Maybe::Ok(); +} +} // namespace +REGISTER_USER_OP("dim_scatter_add_like") + .Input("like") + .Input("input") + .Input("index") + .Output("output") + .Attr("dim") + .SetTensorDescInferFn(InferTensorDesc) + .SetInputArgModifyFn(InputArgModifierFn) + .SetBatchAxisInferFn(InferBatchAxis) + .SetGetSbpFn(SetSbp); + +} // namespace user_op +} // namespace oneflow \ No newline at end of file From bf505f8e2f3349ef4a56c250171410c161aed3f3 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Fri, 20 Nov 2020 11:09:02 +0800 Subject: [PATCH 02/82] use dim scatter base class --- oneflow/user/kernels/dim_scatter_kernels.cpp | 37 +++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) 
diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index a9d9b2bc6ed..0756663e1cc 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -13,17 +13,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "oneflow/core/common/util.h" #include "oneflow/user/kernels/dim_scatter_kernel_util.h" namespace oneflow { namespace user_op { template -class ScatterDimKernel final : public user_op::OpKernel { +class DimScatterBaseKernel: public user_op::OpKernel { public: - ScatterDimKernel() = default; - ~ScatterDimKernel() override = default; - + DimScatterBaseKernel() = default; + ~DimScatterBaseKernel() override = default; + virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) const{ + UNIMPLEMENTED(); + } private: void Compute(KernelComputeContext* ctx) const override { const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); @@ -49,13 +54,35 @@ class ScatterDimKernel final : public user_op::OpKernel { shape2dims(out_tensor->shape()); DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); - DimScatterAddFunctor()( + BinaryOp( ctx->device_ctx(), input_nd_helper, output_nd_helper, ndim, input_tensor->shape().elem_cnt(), dim, index, src, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; +template +class ScatterDimKernel final : public DimScatterBaseKernel { + public: + ScatterDimKernel() = default; + ~ScatterDimKernel() override = default; + + private: + void BinaryOp(DeviceCtx* ctx, + const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, + int ndim, int64_t elem_cnt, + int32_t dim, + const IDX_T* index, + const IN_T* src, + IN_T* output) const override{ + DimScatterAddFunctor()( + ctx, input_nd_helper, output_nd_helper, ndim, + elem_cnt, dim, index, src, output); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + #define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_add_like") \ .SetCreateFn>() \ From a6d28b9a8c445dd65c6c070e186899315e44471b Mon Sep 17 00:00:00 2001 From: doombeaker Date: Fri, 20 Nov 2020 12:49:32 +0800 Subject: [PATCH 03/82] refine(using binop to abstract scatter update and add --- .../user/kernels/dim_gather_scatter_util.h | 14 ++++-- .../user/kernels/dim_scatter_kernel_util.cpp | 19 +++++++- .../user/kernels/dim_scatter_kernel_util.cu | 47 ++++++++++++++++--- .../user/kernels/dim_scatter_kernel_util.h | 17 +++++-- oneflow/user/kernels/dim_scatter_kernels.cpp | 36 +++++++++++--- 5 files changed, 112 insertions(+), 21 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index f0c451ecb71..9f875744a5d 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -38,17 +38,25 @@ namespace user_op { template using DimOpIndexNdHelper = NdIndexOffsetHelper; +template +using BinaryOpFn = void(*)(const T* x, T* y); + template -struct DeviceAdd { - OF_DEVICE_FUNC static void Invoke(const T* x, T* y) { +struct DeviceBinOp { + OF_DEVICE_FUNC static void Add(const T* x, T* y) { #ifdef __CUDA_ARCH__ 
gpu_atomic_add(y, *x); // TODO:(YaoChi), refine add using float16 -> half -> float -> half #else *y += *x; #endif - }; + } + + OF_DEVICE_FUNC static void Update(const T* x, T* y) { + *y = *x; + } }; + } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.cpp index 3bcd196b5f1..74dd4987a02 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/dim_gather_scatter_util.h" #include "oneflow/user/kernels/dim_scatter_kernel_util.h" namespace oneflow { @@ -25,12 +26,26 @@ struct DimScatterAddFunctor final { void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - DoDimScatterAdd(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, - input, output); + DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, + input, output, + DeviceBinOp::Add); + } +}; + +template +struct DimScatterUpdateFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { + DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, + input, output, + DeviceBinOp::Update); } }; OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR, (DeviceType::kCPU), DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ, INDEX_DATA_TYPE_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_UPDATE_FUNCTOR, (DeviceType::kCPU), + DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ, INDEX_DATA_TYPE_SEQ); } // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cu b/oneflow/user/kernels/dim_scatter_kernel_util.cu index ffdd614c6bd..54d407a4bda 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cu @@ -21,12 +21,13 @@ namespace oneflow { namespace user_op { template -__global__ void DoCUDAScatterDimAdd(const DimOpIndexNdHelper input_nd_helper, +__global__ void DoCUDADimScatterAdd(const DimOpIndexNdHelper input_nd_helper, const DimOpIndexNdHelper output_nd_helper, int ndim, int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - DoDimScatterAdd(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, - output); + DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, + input, output, + DeviceBinOp::Add); } template @@ -34,7 +35,7 @@ struct DimScatterAddFunctor final { void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - RUN_CUDA_KERNEL((DoCUDAScatterDimAdd), ctx, BlocksNum4ThreadsNum(elem_cnt), + RUN_CUDA_KERNEL((DoCUDADimScatterAdd), ctx, BlocksNum4ThreadsNum(elem_cnt), input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, output); } }; @@ -45,7 +46,40 @@ struct DimScatterAddFunctor final { void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& 
output_nd_helper, int ndim, int64_t elem_cnt, int32_t dim, const IDX_T* index, const float16* input, float16* output) { - RUN_CUDA_KERNEL((DoCUDAScatterDimAdd), ctx, BlocksNum4ThreadsNum(elem_cnt), + RUN_CUDA_KERNEL((DoCUDADimScatterAdd), ctx, BlocksNum4ThreadsNum(elem_cnt), + input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, + reinterpret_cast(input), reinterpret_cast(output)); + } +}; + + +template +__global__ void DoCUDADimScatterUpdate(const DimOpIndexNdHelper input_nd_helper, + const DimOpIndexNdHelper output_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, const IDX_T* index, + const IN_T* input, IN_T* output) { + DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, + input, output, + DeviceBinOp::Update); +} + +template +struct DimScatterUpdateFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { + RUN_CUDA_KERNEL((DoCUDADimScatterUpdate), ctx, BlocksNum4ThreadsNum(elem_cnt), + input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, output); + } +}; + +// float16 special case of DimScatterAddFunctor template +template +struct DimScatterUpdateFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const float16* input, float16* output) { + RUN_CUDA_KERNEL((DoCUDADimScatterUpdate), ctx, BlocksNum4ThreadsNum(elem_cnt), input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, reinterpret_cast(input), reinterpret_cast(output)); } @@ -53,7 +87,8 @@ struct DimScatterAddFunctor final { OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR, (DeviceType::kGPU), DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ, INDEX_DATA_TYPE_SEQ); - +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_UPDATE_FUNCTOR, (DeviceType::kGPU), + DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ, INDEX_DATA_TYPE_SEQ); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index 805b89b9b7d..dbfe68bd798 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -28,24 +28,35 @@ struct DimScatterAddFunctor final { int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output); }; +template +struct DimScatterUpdateFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output); +}; + template -OF_DEVICE_FUNC void DoDimScatterAdd(const DimOpIndexNdHelper& input_nd_helper, +OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, int32_t dim, const IDX_T* index, - const IN_T* input, IN_T* output) { + const IN_T* input, IN_T* output, + BinaryOpFn bin_op) { XPU_1D_KERNEL_LOOP(input_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; input_nd_helper.OffsetToNdIndex(input_offset, coordinate, ndim); coordinate[dim] = index[input_offset]; IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); - DeviceAdd::Invoke(input + input_offset, output + output_offset); + bin_op(input + input_offset, output 
+ output_offset); } } #define INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR(device_type_v, dtype_pair, itype_pair) \ template struct DimScatterAddFunctor; +#define INSTANTIATE_DIM_SCATTER_UPDATE_FUNCTOR(device_type_v, dtype_pair, itype_pair) \ + template struct DimScatterUpdateFunctor; } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 0756663e1cc..7a65a383ff5 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -26,9 +26,7 @@ class DimScatterBaseKernel: public user_op::OpKernel { ~DimScatterBaseKernel() override = default; virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) const{ - UNIMPLEMENTED(); - } + int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) const = 0; private: void Compute(KernelComputeContext* ctx) const override { const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); @@ -62,10 +60,10 @@ class DimScatterBaseKernel: public user_op::OpKernel { }; template -class ScatterDimKernel final : public DimScatterBaseKernel { +class DimScatterAddKernel final : public DimScatterBaseKernel { public: - ScatterDimKernel() = default; - ~ScatterDimKernel() override = default; + DimScatterAddKernel() = default; + ~DimScatterAddKernel() override = default; private: void BinaryOp(DeviceCtx* ctx, @@ -76,6 +74,7 @@ class ScatterDimKernel final : public DimScatterBaseKernel()( ctx, input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, src, output); @@ -83,9 +82,32 @@ class ScatterDimKernel final : public DimScatterBaseKernel +class DimScatterUpdateKernel final : public DimScatterBaseKernel { + public: + DimScatterUpdateKernel() = default; + ~DimScatterUpdateKernel() override = default; + + private: + void BinaryOp(DeviceCtx* ctx, + const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, + int ndim, int64_t elem_cnt, + int32_t dim, + const IDX_T* index, + const IN_T* src, + IN_T* output) const override{ + + DimScatterUpdateFunctor()( + ctx, input_nd_helper, output_nd_helper, ndim, + elem_cnt, dim, index, src, output); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + #define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_add_like") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); From bb05c9b30f5fc6c937f182d141582387f1ee656e Mon Sep 17 00:00:00 2001 From: doombeaker Date: Fri, 20 Nov 2020 13:53:54 +0800 Subject: [PATCH 04/82] refine (use macros to implement kerenl class and functors) --- .../user/kernels/dim_gather_scatter_util.h | 56 +++++++++++-- .../user/kernels/dim_scatter_kernel_util.cpp | 23 +----- .../user/kernels/dim_scatter_kernel_util.cu | 66 +--------------- .../user/kernels/dim_scatter_kernel_util.h | 24 ++---- oneflow/user/kernels/dim_scatter_kernels.cpp | 79 +++++++------------ 5 files changed, 88 insertions(+), 160 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index 9f875744a5d..51e34cf0718 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ 
b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -38,8 +38,8 @@ namespace user_op { template using DimOpIndexNdHelper = NdIndexOffsetHelper; -template -using BinaryOpFn = void(*)(const T* x, T* y); +template +using BinaryOpFn = void (*)(const T* x, T* y); template struct DeviceBinOp { @@ -51,11 +51,57 @@ struct DeviceBinOp { #endif } - OF_DEVICE_FUNC static void Update(const T* x, T* y) { - *y = *x; - } + OF_DEVICE_FUNC static void Update(const T* x, T* y) { *y = *x; } }; +#define DECLARE_DIMSCATTER_FUNCTOR(binop) \ + template \ + struct DimScatter##binop##Functor final { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output); \ + } + +#define IMPLEMENT_DIMSCATTER_CPUFUNCTOR(binop) \ + template \ + struct DimScatter##binop##Functor final { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { \ + DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, \ + index, input, output, DeviceBinOp::binop); \ + } \ + } + +#define IMPLEMENT_DIMSCATTER_GPUFUNCTOR(binop) \ + template \ + __global__ void DoCUDADimScatter##binop(const DimOpIndexNdHelper input_nd_helper, \ + const DimOpIndexNdHelper output_nd_helper, \ + int ndim, int64_t elem_cnt, int32_t dim, \ + const IDX_T* index, const IN_T* input, IN_T* output) { \ + DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, \ + input, output, DeviceBinOp::binop); \ + } \ + template \ + struct DimScatter##binop##Functor final { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { \ + RUN_CUDA_KERNEL((DoCUDADimScatter##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ + input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, \ + output); \ + } \ + }; \ + template \ + struct DimScatter##binop##Functor final { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const float16* input, float16* output) { \ + RUN_CUDA_KERNEL((DoCUDADimScatter##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ + input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, \ + reinterpret_cast(input), reinterpret_cast(output)); \ + } \ + } } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.cpp index 74dd4987a02..14cdfeb8f14 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cpp @@ -21,27 +21,8 @@ limitations under the License. 
namespace oneflow { namespace user_op { -template -struct DimScatterAddFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, - input, output, - DeviceBinOp::Add); - } -}; - -template -struct DimScatterUpdateFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, - input, output, - DeviceBinOp::Update); - } -}; +IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Add); +IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Update); OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR, (DeviceType::kCPU), DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ, INDEX_DATA_TYPE_SEQ); diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cu b/oneflow/user/kernels/dim_scatter_kernel_util.cu index 54d407a4bda..5337f6fc5a8 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cu @@ -20,70 +20,8 @@ limitations under the License. namespace oneflow { namespace user_op { -template -__global__ void DoCUDADimScatterAdd(const DimOpIndexNdHelper input_nd_helper, - const DimOpIndexNdHelper output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, - const IN_T* input, IN_T* output) { - DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, - input, output, - DeviceBinOp::Add); -} - -template -struct DimScatterAddFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - RUN_CUDA_KERNEL((DoCUDADimScatterAdd), ctx, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, output); - } -}; - -// float16 special case of DimScatterAddFunctor template -template -struct DimScatterAddFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const float16* input, float16* output) { - RUN_CUDA_KERNEL((DoCUDADimScatterAdd), ctx, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, - reinterpret_cast(input), reinterpret_cast(output)); - } -}; - - -template -__global__ void DoCUDADimScatterUpdate(const DimOpIndexNdHelper input_nd_helper, - const DimOpIndexNdHelper output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, - const IN_T* input, IN_T* output) { - DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, - input, output, - DeviceBinOp::Update); -} - -template -struct DimScatterUpdateFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - RUN_CUDA_KERNEL((DoCUDADimScatterUpdate), ctx, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, 
output); - } -}; - -// float16 special case of DimScatterAddFunctor template -template -struct DimScatterUpdateFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const float16* input, float16* output) { - RUN_CUDA_KERNEL((DoCUDADimScatterUpdate), ctx, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, - reinterpret_cast(input), reinterpret_cast(output)); - } -}; +IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Add); +IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Update); OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR, (DeviceType::kGPU), DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ, INDEX_DATA_TYPE_SEQ); diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index dbfe68bd798..ffa650231e1 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -21,26 +21,14 @@ namespace oneflow { namespace user_op { -template -struct DimScatterAddFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output); -}; - -template -struct DimScatterUpdateFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output); -}; +DECLARE_DIMSCATTER_FUNCTOR(Add); +DECLARE_DIMSCATTER_FUNCTOR(Update); template OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, - const IN_T* input, IN_T* output, - BinaryOpFn bin_op) { + const DimOpIndexNdHelper& output_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, const IDX_T* index, + const IN_T* input, IN_T* output, BinaryOpFn bin_op) { XPU_1D_KERNEL_LOOP(input_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; input_nd_helper.OffsetToNdIndex(input_offset, coordinate, ndim); @@ -56,7 +44,7 @@ OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& input_nd_ OF_PP_PAIR_FIRST(itype_pair)>; #define INSTANTIATE_DIM_SCATTER_UPDATE_FUNCTOR(device_type_v, dtype_pair, itype_pair) \ template struct DimScatterUpdateFunctor; + OF_PP_PAIR_FIRST(itype_pair)>; } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 7a65a383ff5..3d79825ed9d 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -19,14 +19,33 @@ limitations under the License. 
namespace oneflow { namespace user_op { +#define IMPLEMENT_DIMSCATTER_KERNEL_CLASS(binop) \ + template \ + class DimScatter##binop##Kernel final : public DimScatterBaseKernel { \ + public: \ + DimScatter##binop##Kernel() = default; \ + ~DimScatter##binop##Kernel() override = default; \ + \ + private: \ + void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) const override { \ + DimScatter##binop##Functor()( \ + ctx, input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, src, output); \ + } \ + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ + } + template -class DimScatterBaseKernel: public user_op::OpKernel { +class DimScatterBaseKernel : public user_op::OpKernel { public: DimScatterBaseKernel() = default; ~DimScatterBaseKernel() override = default; virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) const = 0; + const DimOpIndexNdHelper& output_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* src, + IN_T* output) const = 0; + private: void Compute(KernelComputeContext* ctx) const override { const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); @@ -52,62 +71,18 @@ class DimScatterBaseKernel: public user_op::OpKernel { shape2dims(out_tensor->shape()); DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); - BinaryOp( - ctx->device_ctx(), input_nd_helper, output_nd_helper, ndim, - input_tensor->shape().elem_cnt(), dim, index, src, output); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class DimScatterAddKernel final : public DimScatterBaseKernel { - public: - DimScatterAddKernel() = default; - ~DimScatterAddKernel() override = default; - - private: - void BinaryOp(DeviceCtx* ctx, - const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, - int ndim, int64_t elem_cnt, - int32_t dim, - const IDX_T* index, - const IN_T* src, - IN_T* output) const override{ - - DimScatterAddFunctor()( - ctx, input_nd_helper, output_nd_helper, ndim, - elem_cnt, dim, index, src, output); + BinaryOp(ctx->device_ctx(), input_nd_helper, output_nd_helper, ndim, + input_tensor->shape().elem_cnt(), dim, index, src, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -template -class DimScatterUpdateKernel final : public DimScatterBaseKernel { - public: - DimScatterUpdateKernel() = default; - ~DimScatterUpdateKernel() override = default; - - private: - void BinaryOp(DeviceCtx* ctx, - const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, - int ndim, int64_t elem_cnt, - int32_t dim, - const IDX_T* index, - const IN_T* src, - IN_T* output) const override{ - - DimScatterUpdateFunctor()( - ctx, input_nd_helper, output_nd_helper, ndim, - elem_cnt, dim, index, src, output); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; +IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Add); +IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Update); #define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_add_like") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & 
(user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); From cc4ad5a0edd98b6c010138116a3a44d5219229e9 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Fri, 20 Nov 2020 15:35:33 +0800 Subject: [PATCH 05/82] refine(description for register scatter ops/kernels) --- oneflow/python/ops/array_ops.py | 34 +++++++++-- .../user/kernels/dim_gather_scatter_util.h | 14 +++++ oneflow/user/kernels/dim_scatter_kernels.cpp | 61 +++++++++++-------- oneflow/user/ops/dim_scatter_ops.cpp | 11 ++++ 4 files changed, 90 insertions(+), 30 deletions(-) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 9925ca69016..4b2cab1d702 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2314,20 +2314,46 @@ def amp_white_identity( return op.InferAndTryRun().SoleOutputBlob() -@oneflow_export("dim_scatter") -def dim_scatter( +@oneflow_export("dim_scatter_update_like") +def dim_scatter_update( dim: int, index: remote_blob_util.BlobDef, src: remote_blob_util.BlobDef, + like: remote_blob_util.BlobDef, + name: Optional[str] = None, +) -> remote_blob_util.BlobDef: + return ( + flow.user_op_builder( + name if name is not None else id_util.UniqueStr("DimScatterUpdateLike_") + ) + .Op("dim_scatter_update_like") + .Input("input", [src]) + .Input("index", [index]) + .Input("like", [like]) + .Output("output") + .Attr("dim", int(dim)) + .Build() + .InferAndTryRun() + .RemoteBlobList()[0] + ) + + +@oneflow_export("dim_scatter_add_like") +def dim_scatter_add( + dim: int, + index: remote_blob_util.BlobDef, + src: remote_blob_util.BlobDef, + like: remote_blob_util.BlobDef, name: Optional[str] = None, ) -> remote_blob_util.BlobDef: return ( flow.user_op_builder( - name if name is not None else id_util.UniqueStr("DimScatter_") + name if name is not None else id_util.UniqueStr("DimScatterAddLike_") ) - .Op("dim_scatter") + .Op("dim_scatter_add_like") .Input("input", [src]) .Input("index", [index]) + .Input("like", [like]) .Output("output") .Attr("dim", int(dim)) .Build() diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index 51e34cf0718..4ff0338870e 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -41,6 +41,20 @@ using DimOpIndexNdHelper = NdIndexOffsetHelper; template using BinaryOpFn = void (*)(const T* x, T* y); +// Steps for adding a binary operation on scatter are as follows: +// 1. implment binop in DeviceBinOp, for example "Mul": +// OF_DEVICE_FUNC static void Mul(const T* x, T* y) { *y *= *x; } +// 2. Implement and register kernels in dim_scatter_kernels.cpp: +// IMPLEMENT_AND_REGISTER_KERNEL("scatter_mul_like", Mul); +// 3. Declare Functor in dim_scatter_kernel_util.h: +// DECLARE_DIMSCATTER_FUNCTOR(Mul); +// 4. 
Implement functors in dim_scatter_kernel_util.cu and cpp file: +// in .cu file: +// IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Mul); +// in .cpp file: +// IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Mul); +// + template struct DeviceBinOp { OF_DEVICE_FUNC static void Add(const T* x, T* y) { diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 3d79825ed9d..6571124129b 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -36,6 +36,38 @@ namespace user_op { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ } +#define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype, optypename, binop) \ + REGISTER_USER_KERNEL(optypename) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); + +#define REGISTER_DIM_SCATTER_BINOP_KERNELS_WITH_DEVICE(device, optypename, binop) \ + REGISTER_DIM_SCATTER_KERNEL(device, float, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_KERNEL(device, double, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_KERNEL(device, float, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_KERNEL(device, double, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int64_t, optypename, binop) + +#define REGISTER_DIM_SCATTER_CPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_KERNELS_WITH_DEVICE(DeviceType::kCPU, optypename, binop); + +#ifdef WITH_CUDA +#define REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_KERNELS_WITH_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); +#else +#define REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop) +#endif // WITH_CUDA + +#define IMPLEMENT_AND_REGISTER_KERNEL(optypename, binop) \ + IMPLEMENT_DIMSCATTER_KERNEL_CLASS(binop); \ + REGISTER_DIM_SCATTER_CPUKERNELS(optypename, binop); \ + REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop); + template class DimScatterBaseKernel : public user_op::OpKernel { public: @@ -77,31 +109,8 @@ class DimScatterBaseKernel : public user_op::OpKernel { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Add); -IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Update); - -#define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype) \ - REGISTER_USER_KERNEL("dim_scatter_add_like") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); - -#define REGISTER_DIM_SCATTER_ADD_LIKE_KERNELS_WITH_DEVICE(device) \ - REGISTER_DIM_SCATTER_KERNEL(device, float, int32_t) \ - REGISTER_DIM_SCATTER_KERNEL(device, double, int32_t) \ - REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int32_t) \ - REGISTER_DIM_SCATTER_KERNEL(device, float, int64_t) \ - REGISTER_DIM_SCATTER_KERNEL(device, double, int64_t) \ - REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int64_t) - -REGISTER_DIM_SCATTER_ADD_LIKE_KERNELS_WITH_DEVICE(DeviceType::kCPU); - -#ifdef WITH_CUDA -REGISTER_DIM_SCATTER_ADD_LIKE_KERNELS_WITH_DEVICE(DeviceType::kGPU); -REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, 
int32_t); -REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int64_t); -#endif // WITH_CUDA +IMPLEMENT_AND_REGISTER_KERNEL("dim_scatter_add_like", Add); +IMPLEMENT_AND_REGISTER_KERNEL("dim_scatter_update_like", Update); } // namespace user_op -} // namespace oneflow \ No newline at end of file +} // namespace oneflow diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 13ad166a2e8..d3fffc62ef9 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -107,5 +107,16 @@ REGISTER_USER_OP("dim_scatter_add_like") .SetBatchAxisInferFn(InferBatchAxis) .SetGetSbpFn(SetSbp); +REGISTER_USER_OP("dim_scatter_update_like") + .Input("like") + .Input("input") + .Input("index") + .Output("output") + .Attr("dim") + .SetTensorDescInferFn(InferTensorDesc) + .SetInputArgModifyFn(InputArgModifierFn) + .SetBatchAxisInferFn(InferBatchAxis) + .SetGetSbpFn(SetSbp); + } // namespace user_op } // namespace oneflow \ No newline at end of file From 645a19ef3a6a71282e20d68ea7f800741a7aba44 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Fri, 20 Nov 2020 19:24:21 +0800 Subject: [PATCH 06/82] refine --- .../user/kernels/dim_gather_scatter_util.h | 2 ++ .../user/kernels/dim_scatter_kernel_util.cpp | 6 ++-- .../user/kernels/dim_scatter_kernel_util.cu | 7 ++--- .../user/kernels/dim_scatter_kernel_util.h | 28 +++++++++++++++---- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index 4ff0338870e..3abba043265 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -51,8 +51,10 @@ using BinaryOpFn = void (*)(const T* x, T* y); // 4. Implement functors in dim_scatter_kernel_util.cu and cpp file: // in .cu file: // IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Mul); +// INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Mul); // in .cpp file: // IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Mul); +// INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Mul); // template diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.cpp index 14cdfeb8f14..577416c1ce0 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cpp @@ -24,9 +24,7 @@ namespace user_op { IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Add); IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Update); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR, (DeviceType::kCPU), - DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ, INDEX_DATA_TYPE_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_UPDATE_FUNCTOR, (DeviceType::kCPU), - DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ, INDEX_DATA_TYPE_SEQ); +INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Add); +INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Update); } // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cu b/oneflow/user/kernels/dim_scatter_kernel_util.cu index 5337f6fc5a8..f2df4fbb082 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cu @@ -23,10 +23,9 @@ namespace user_op { IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Add); IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Update); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR, (DeviceType::kGPU), - DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ, INDEX_DATA_TYPE_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_SCATTER_UPDATE_FUNCTOR, (DeviceType::kGPU), - 
DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ, INDEX_DATA_TYPE_SEQ); +INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Add); +INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Update); + } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index ffa650231e1..1911f8fdf71 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -39,12 +39,28 @@ OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& input_nd_ } } -#define INSTANTIATE_DIM_SCATTER_ADD_FUNCTOR(device_type_v, dtype_pair, itype_pair) \ - template struct DimScatterAddFunctor; -#define INSTANTIATE_DIM_SCATTER_UPDATE_FUNCTOR(device_type_v, dtype_pair, itype_pair) \ - template struct DimScatterUpdateFunctor; +#define INSTANTIATE_DIM_SCATTER_FUNCTOR(devicetype, dtype, itype, binop) \ + template struct DimScatter##binop##Functor; + +#define INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, int32_t, int32_t, binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, float, int32_t, binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, double, int32_t, binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, float16, int32_t, binop) \ + \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, int32_t, int64_t, binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, float, int64_t, binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, double, int64_t, binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, float16, int64_t, binop) + +#define INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, int32_t, int32_t, binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, float, int32_t, binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, double, int32_t, binop) \ + \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, int32_t, int64_t, binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, float, int64_t, binop) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, double, int64_t, binop) } // namespace user_op } // namespace oneflow From 2b0b14614ee018960ab6bc42cb7eaf74dbddfe27 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Sat, 21 Nov 2020 17:17:58 +0800 Subject: [PATCH 07/82] add inplace ops --- .../user/kernels/dim_gather_scatter_util.h | 14 ++- oneflow/user/kernels/dim_scatter_kernels.cpp | 85 ++++++++++++++----- oneflow/user/ops/dim_scatter_ops.cpp | 20 +++++ 3 files changed, 94 insertions(+), 25 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index 3abba043265..4a25d74b7ab 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -44,11 +44,17 @@ using BinaryOpFn = void (*)(const T* x, T* y); // Steps for adding a binary operation on scatter are as follows: // 1. implment binop in DeviceBinOp, for example "Mul": // OF_DEVICE_FUNC static void Mul(const T* x, T* y) { *y *= *x; } -// 2. Implement and register kernels in dim_scatter_kernels.cpp: -// IMPLEMENT_AND_REGISTER_KERNEL("scatter_mul_like", Mul); -// 3. Declare Functor in dim_scatter_kernel_util.h: +// +// 2. Implement kernels in dim_scatter_kernels.cpp: +// IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Mul); +// +// 3. Register kernels +// REGISTER_SCATTER_OUTPLACE_KERNEL("dim_scatter_add_like", Add); +// +// 4. 
Declare Functor in dim_scatter_kernel_util.h: // DECLARE_DIMSCATTER_FUNCTOR(Mul); -// 4. Implement functors in dim_scatter_kernel_util.cu and cpp file: +// +// 5. Implement functors in dim_scatter_kernel_util.cu and cpp file: // in .cu file: // IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Mul); // INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Mul); diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 6571124129b..bcdfb9acfc8 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -36,37 +36,76 @@ namespace user_op { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ } -#define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype, optypename, binop) \ +#define REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, dtype, itype, optypename, binop) \ REGISTER_USER_KERNEL(optypename) \ .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -#define REGISTER_DIM_SCATTER_BINOP_KERNELS_WITH_DEVICE(device, optypename, binop) \ - REGISTER_DIM_SCATTER_KERNEL(device, float, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_KERNEL(device, double, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_KERNEL(device, float, int64_t, optypename, binop) \ - REGISTER_DIM_SCATTER_KERNEL(device, double, int64_t, optypename, binop) \ - REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int64_t, optypename, binop) +#define REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, float, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, double, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, int32_t, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, float, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, double, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, int32_t, int64_t, optypename, binop) -#define REGISTER_DIM_SCATTER_CPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_KERNELS_WITH_DEVICE(DeviceType::kCPU, optypename, binop); +#define REGISTER_DIM_SCATTER_OUTPLACE_CPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); #ifdef WITH_CUDA -#define REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_KERNELS_WITH_DEVICE(DeviceType::kGPU, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); +#define REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); #else -#define REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop) +#define REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop) #endif // WITH_CUDA -#define IMPLEMENT_AND_REGISTER_KERNEL(optypename, binop) \ - IMPLEMENT_DIMSCATTER_KERNEL_CLASS(binop); \ - REGISTER_DIM_SCATTER_CPUKERNELS(optypename, binop); \ - 
REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop); +#define REGISTER_SCATTER_OUTPLACE_KERNEL(optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_CPUKERNELS(optypename, binop); \ + REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop); + +// ---- REGISTER INPLACE OPS ---- +Maybe SetInplace(const user_op::InferContext&, + user_op::AddInplaceArgPair AddInplaceArgPairFn){ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("output", 0, "like", 0, true)); + return Maybe::Ok(); +} + +#define REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, dtype, itype, optypename, binop) \ + REGISTER_USER_KERNEL(optypename) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value))\ + .SetInplaceProposalFn(SetInplace); + + +#define REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, float, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, double, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, int32_t, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, float, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, double, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, int32_t, int64_t, optypename, binop) + +#define REGISTER_DIM_SCATTER_INPLACE_CPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); + +#ifdef WITH_CUDA +#define REGISTER_DIM_SCATTER_INPLACE_GPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); +#else +#define REGISTER_DIM_SCATTER_INPLACE_GPUKERNELS(optypename, binop) +#endif // WITH_CUDA + +#define REGISTER_SCATTER_INTPLACE_KERNEL(optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_CPUKERNELS(optypename, binop); \ + REGISTER_DIM_SCATTER_INPLACE_GPUKERNELS(optypename, binop); template class DimScatterBaseKernel : public user_op::OpKernel { @@ -109,8 +148,12 @@ class DimScatterBaseKernel : public user_op::OpKernel { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -IMPLEMENT_AND_REGISTER_KERNEL("dim_scatter_add_like", Add); -IMPLEMENT_AND_REGISTER_KERNEL("dim_scatter_update_like", Update); +IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Add); +IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Update); + +REGISTER_SCATTER_OUTPLACE_KERNEL("dim_scatter_add_like", Add); +REGISTER_SCATTER_OUTPLACE_KERNEL("dim_scatter_update_like", Update); +REGISTER_SCATTER_INTPLACE_KERNEL("dim_scatter_add", Add); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index d3fffc62ef9..63aed0f906f 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -64,6 +64,15 @@ Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierF return Maybe::Ok(); } +Maybe InplaceInputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, + const user_op::UserOpConfWrapper&) { + user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); + CHECK(like_arg_modifier != nullptr); + like_arg_modifier->set_use_header_only(false); + 
like_arg_modifier->set_requires_grad(false); + return Maybe::Ok(); +} + Maybe InferBatchAxis(user_op::BatchAxisContext* ctx) { CHECK_OR_RETURN(*ctx->BatchAxis4ArgNameAndIndex("index", 0) == *ctx->BatchAxis4ArgNameAndIndex("input", 0)); @@ -118,5 +127,16 @@ REGISTER_USER_OP("dim_scatter_update_like") .SetBatchAxisInferFn(InferBatchAxis) .SetGetSbpFn(SetSbp); +REGISTER_USER_OP("dim_scatter_add") //inplace + .Input("like") + .Input("input") + .Input("index") + .Output("output") + .Attr("dim") + .SetTensorDescInferFn(InferTensorDesc) + .SetInputArgModifyFn(InplaceInputArgModifierFn) + .SetBatchAxisInferFn(InferBatchAxis) + .SetGetSbpFn(SetSbp); + } // namespace user_op } // namespace oneflow \ No newline at end of file From 9081a0a2392ec850360eca8ac09f79aea27473df Mon Sep 17 00:00:00 2001 From: doombeaker Date: Mon, 23 Nov 2020 11:52:02 +0800 Subject: [PATCH 08/82] python wraper scatter_add inplace --- oneflow/python/ops/array_ops.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 4b2cab1d702..d2c8df2440f 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2360,3 +2360,26 @@ def dim_scatter_add( .InferAndTryRun() .RemoteBlobList()[0] ) + +@oneflow_export("dim_scatter_add") +def dim_scatter_add( + dim: int, + index: remote_blob_util.BlobDef, + src: remote_blob_util.BlobDef, + like: remote_blob_util.BlobDef, + name: Optional[str] = None, +) -> remote_blob_util.BlobDef: + return ( + flow.user_op_builder( + name if name is not None else id_util.UniqueStr("DimScatterAddLike_") + ) + .Op("dim_scatter_add") + .Input("input", [src]) + .Input("index", [index]) + .Input("like", [like]) + .Output("output") + .Attr("dim", int(dim)) + .Build() + .InferAndTryRun() + .RemoteBlobList()[0] + ) From 24fe455c3c94c387d636c93ab9d9c2c4a36d87cf Mon Sep 17 00:00:00 2001 From: doombeaker Date: Tue, 24 Nov 2020 14:35:36 +0800 Subject: [PATCH 09/82] dev inplace ops --- oneflow/python/ops/array_ops.py | 4 +- oneflow/user/kernels/dim_scatter_kernels.cpp | 12 +++- oneflow/user/ops/dim_scatter_ops.cpp | 61 +++++++++----------- 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index d2c8df2440f..aa048b6eb9b 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2315,7 +2315,7 @@ def amp_white_identity( @oneflow_export("dim_scatter_update_like") -def dim_scatter_update( +def dim_scatter_update_like( dim: int, index: remote_blob_util.BlobDef, src: remote_blob_util.BlobDef, @@ -2339,7 +2339,7 @@ def dim_scatter_update( @oneflow_export("dim_scatter_add_like") -def dim_scatter_add( +def dim_scatter_add_like( dim: int, index: remote_blob_util.BlobDef, src: remote_blob_util.BlobDef, diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index bcdfb9acfc8..accb198c5b1 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -129,7 +129,17 @@ class DimScatterBaseKernel : public user_op::OpKernel { IN_T* output = out_tensor->mut_dptr(); size_t out_bytes_size = out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); - Memset(ctx->device_ctx(), output, 0, out_bytes_size); + + Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); + if(!like_tensor->dptr()){ + Memset(ctx->device_ctx(), output, 0, out_bytes_size); + }else { + const IN_T* like = like_tensor->dptr(); + 
if(output != like) + { + Memcpy(ctx->device_ctx(), output, like_tensor->dptr(), out_bytes_size); + } + } int ndim = input_tensor->shape().NumAxes(); fixed_vector shape_vec(ndim); diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 63aed0f906f..99a20634436 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -68,7 +68,6 @@ Maybe InplaceInputArgModifierFn(user_op::GetInputArgModifier GetInputArgMo const user_op::UserOpConfWrapper&) { user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); CHECK(like_arg_modifier != nullptr); - like_arg_modifier->set_use_header_only(false); like_arg_modifier->set_requires_grad(false); return Maybe::Ok(); } @@ -105,38 +104,34 @@ Maybe SetSbp(user_op::SbpContext* ctx) { return Maybe::Ok(); } } // namespace -REGISTER_USER_OP("dim_scatter_add_like") - .Input("like") - .Input("input") - .Input("index") - .Output("output") - .Attr("dim") - .SetTensorDescInferFn(InferTensorDesc) - .SetInputArgModifyFn(InputArgModifierFn) - .SetBatchAxisInferFn(InferBatchAxis) - .SetGetSbpFn(SetSbp); - -REGISTER_USER_OP("dim_scatter_update_like") - .Input("like") - .Input("input") - .Input("index") - .Output("output") - .Attr("dim") - .SetTensorDescInferFn(InferTensorDesc) - .SetInputArgModifyFn(InputArgModifierFn) - .SetBatchAxisInferFn(InferBatchAxis) - .SetGetSbpFn(SetSbp); - -REGISTER_USER_OP("dim_scatter_add") //inplace - .Input("like") - .Input("input") - .Input("index") - .Output("output") - .Attr("dim") - .SetTensorDescInferFn(InferTensorDesc) - .SetInputArgModifyFn(InplaceInputArgModifierFn) - .SetBatchAxisInferFn(InferBatchAxis) - .SetGetSbpFn(SetSbp); + +#define REGISTER_SCATTER_LIKE_OP(optypename) \ +REGISTER_USER_OP(optypename) \ + .Input("like") \ + .Input("input") \ + .Input("index") \ + .Output("output") \ + .Attr("dim") \ + .SetTensorDescInferFn(InferTensorDesc) \ + .SetInputArgModifyFn(InputArgModifierFn) \ + .SetBatchAxisInferFn(InferBatchAxis) \ + .SetGetSbpFn(SetSbp) + +#define REGISTER_SCATTER_INPLACE_OP(optypename) \ +REGISTER_USER_OP(optypename) \ + .Input("like") \ + .Input("input") \ + .Input("index") \ + .Output("output") \ + .Attr("dim") \ + .SetTensorDescInferFn(InferTensorDesc) \ + .SetInputArgModifyFn(InplaceInputArgModifierFn) \ + .SetBatchAxisInferFn(InferBatchAxis) \ + .SetGetSbpFn(SetSbp) + +REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); +REGISTER_SCATTER_LIKE_OP("dim_scatter_update_like"); +REGISTER_SCATTER_INPLACE_OP("dim_scatter_add"); } // namespace user_op } // namespace oneflow \ No newline at end of file From 60b1fe959a76c0fc0936776f715834891fe339db Mon Sep 17 00:00:00 2001 From: doombeaker Date: Tue, 24 Nov 2020 21:33:56 +0800 Subject: [PATCH 10/82] refine dim_gather (using macros register mechanism) --- oneflow/python/ops/array_ops.py | 1 + .../user/kernels/dim_gather_kernel_util.cpp | 17 +- .../user/kernels/dim_gather_kernel_util.cu | 39 +--- oneflow/user/kernels/dim_gather_kernel_util.h | 66 +++++-- oneflow/user/kernels/dim_gather_kernels.cpp | 89 ++++++--- .../user/kernels/dim_gather_scatter_util.h | 81 +++++--- .../user/kernels/dim_scatter_kernel_util.h | 22 +++ oneflow/user/kernels/dim_scatter_kernels.cpp | 87 ++++----- oneflow/user/ops/dim_gather_op.cpp | 178 +++++++++--------- oneflow/user/ops/dim_scatter_ops.cpp | 48 ++--- 10 files changed, 358 insertions(+), 270 deletions(-) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index aa048b6eb9b..8b1f85b5dd2 100644 --- 
a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2361,6 +2361,7 @@ def dim_scatter_add_like( .RemoteBlobList()[0] ) + @oneflow_export("dim_scatter_add") def dim_scatter_add( dim: int, diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cpp b/oneflow/user/kernels/dim_gather_kernel_util.cpp index abb3b50faff..be8fde77d04 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cpp +++ b/oneflow/user/kernels/dim_gather_kernel_util.cpp @@ -20,18 +20,11 @@ namespace oneflow { namespace user_op { -template -struct DimGatherFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - DoDimGather(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, - output); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_GATHER_FUNCTOR, (DeviceType::kCPU), - DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ, INDEX_DATA_TYPE_SEQ); +IMPLEMENT_DIMGATHER_CPUFUNCTOR(Update); +INSTANTIATE_DIM_GATHER_CPUFUNCTORS(Update); + +IMPLEMENT_DIMGATHER_CPUFUNCTOR(Add); +INSTANTIATE_DIM_GATHER_CPUFUNCTORS(Add); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cu b/oneflow/user/kernels/dim_gather_kernel_util.cu index fa4b7e10ca2..c60421110f2 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cu +++ b/oneflow/user/kernels/dim_gather_kernel_util.cu @@ -21,39 +21,12 @@ namespace oneflow { namespace user_op { -template -__global__ void DoCUDADimGather(const DimOpIndexNdHelper input_nd_helper, - const DimOpIndexNdHelper index_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, - const IN_T* input, IN_T* output) { - DoDimGather(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, - output); -} - -template -struct DimGatherFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { - RUN_CUDA_KERNEL((DoCUDADimGather), ctx, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, output); - } -}; - -// float16 special case of DimGatherFunctor template -template -struct DimGatherFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const float16* input, float16* output) { - RUN_CUDA_KERNEL((DoCUDADimGather), ctx, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, - reinterpret_cast(input), reinterpret_cast(output)); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_GATHER_FUNCTOR, (DeviceType::kGPU), - DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ, INDEX_DATA_TYPE_SEQ); +IMPLEMENT_DIMGATHER_GPUFUNCTOR(Update); +INSTANTIATE_DIM_GATHER_GPUFUNCTORS(Update); + +IMPLEMENT_DIMGATHER_GPUFUNCTOR(Add); +INSTANTIATE_DIM_GATHER_GPUFUNCTORS(Add); + } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernel_util.h b/oneflow/user/kernels/dim_gather_kernel_util.h index f5b4e18e2f7..6de537f3c5c 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.h +++ b/oneflow/user/kernels/dim_gather_kernel_util.h @@ -17,22 +17,38 @@ limitations under the License. 
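
Reading aid: the hunk below folds the old DoDimGather into the binary-op form DoDimGatherBinop, so the plain dim_gather (Update) and dim_gather_add (Add) kernels can share one loop. The gather-along-dim semantics of the Update case, sketched in plain Python for 2-D tensors (illustrative only, not part of the patch and not the OneFlow API):

# illustrative sketch of gather-along-dim (the Update case), not code from this patch
def dim_gather_2d(input, index, dim):
    rows, cols = len(index), len(index[0])
    output = [[0] * cols for _ in range(rows)]  # output takes the shape of index
    for i in range(rows):
        for j in range(cols):
            coord = [i, j]
            coord[dim] = index[i][j]  # replace the coordinate along dim with the index value
            output[i][j] = input[coord[0]][coord[1]]
    return output

# dim_gather_2d([[1, 2], [3, 4]], [[0, 0], [1, 0]], dim=0) returns [[1, 2], [3, 2]]

The Add variant accumulates into the output buffer instead of overwriting it.
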
#define ONEFLOW_USER_KERNELS_DIM_GATHER_KERNEL_UTIL_H_ #include "oneflow/user/kernels/dim_gather_scatter_util.h" -namespace oneflow { +// Steps for adding a binary operation on gathers are as follows: +// 1. implment binop in DeviceBinOp, for example "Mul": +// OF_DEVICE_FUNC static void Mul(const T* x, T* y) { *y *= *x; } +// +// 2. Declare Functor in dim_gather_kernel_util.h: +// DECLARE_DIMGATHER_FUNCTOR(Mul); +// +// 3. Implement functors in dim_gather_kernel_util.cu and cpp file: +// in .cu file: +// IMPLEMENT_DIMGATHER_GPUFUNCTOR(Mul); +// INSTANTIATE_DIM_GATHER_GPUFUNCTORS(Mul); +// in .cpp file: +// IMPLEMENT_DIMGATHER_CPUFUNCTOR(Mul); +// INSTANTIATE_DIM_GATHER_CPUFUNCTORS(Mul); +// +// 4. Implement kernels in dim_gather_kernels.cpp: +// IMPLEMENT_DIMGATHER_KERNEL_CLASS(Mul); +// +// 5. Register kernels +// REGISTER_GATHER_OUTPLACE_KERNEL("dim_gather_mul_like", Mul); +namespace oneflow { namespace user_op { -template -struct DimGatherFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output); -}; +DECLARE_DIMGATHER_FUNCTOR(Update); +DECLARE_DIMGATHER_FUNCTOR(Add); template -OF_DEVICE_FUNC void DoDimGather(const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, - const IN_T* input, IN_T* output) { +OF_DEVICE_FUNC void DoDimGatherBinop(const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& index_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, const IDX_T* index, + const IN_T* input, IN_T* output, BinaryOpFn bin_op) { XPU_1D_KERNEL_LOOP(index_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; const IDX_T x = index[index_offset]; @@ -40,14 +56,32 @@ OF_DEVICE_FUNC void DoDimGather(const DimOpIndexNdHelper& input_nd_helper coordinate[dim] = x; IDX_T input_offset = input_nd_helper.NdIndexToOffset(coordinate, ndim); - output[index_offset] = input[input_offset]; + bin_op(input + input_offset, output + index_offset); } } -// macros for functors instantiate(used by dim_gather_kernel_util.cu and dim_gather_kernel_uti.cpp) -#define INSTANTIATE_DIM_GATHER_FUNCTOR(device_type_v, dtype_pair, itype_pair) \ - template struct DimGatherFunctor; +#define INSTANTIATE_DIM_GATHER_FUNCTOR(devicetype, dtype, itype, binop) \ + template struct DimGather##binop##Functor; + +#define INSTANTIATE_DIM_GATHER_GPUFUNCTORS(binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, int32_t, int32_t, binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, float, int32_t, binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, double, int32_t, binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, float16, int32_t, binop) \ + \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, int32_t, int64_t, binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, float, int64_t, binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, double, int64_t, binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, float16, int64_t, binop) + +#define INSTANTIATE_DIM_GATHER_CPUFUNCTORS(binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, int32_t, int32_t, binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, float, int32_t, binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, double, int32_t, binop) \ + \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, int32_t, int64_t, binop) \ + 
INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, float, int64_t, binop) \ + INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, double, int64_t, binop) } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernels.cpp b/oneflow/user/kernels/dim_gather_kernels.cpp index ddc8afce08f..2e1314a4a95 100644 --- a/oneflow/user/kernels/dim_gather_kernels.cpp +++ b/oneflow/user/kernels/dim_gather_kernels.cpp @@ -19,11 +19,64 @@ limitations under the License. namespace oneflow { namespace user_op { +#define IMPLEMENT_DIMGATHER_KERNEL_CLASS(binop) \ + template \ + class DimGather##binop##Kernel final : public DimGatherBaseKernel { \ + public: \ + DimGather##binop##Kernel() = default; \ + ~DimGather##binop##Kernel() override = default; \ + \ + private: \ + void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const IN_T* input, \ + IN_T* output) const override { \ + DimGather##binop##Functor()( \ + ctx, input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, output); \ + } \ + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ + }; + +#define REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, dtype, itype, optypename, binop) \ + REGISTER_USER_KERNEL(optypename) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); + +#define REGISTER_DIM_GATHER_BINOP_OUT_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, float, int32_t, optypename, binop) \ + REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, double, int32_t, optypename, binop) \ + REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, int32_t, int32_t, optypename, binop) \ + REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, float, int64_t, optypename, binop) \ + REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, double, int64_t, optypename, binop) \ + REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, int32_t, int64_t, optypename, binop) + +#define REGISTER_DIM_GATHER_OUTPLACE_CPUKERNELS(optypename, binop) \ + REGISTER_DIM_GATHER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); + +#ifdef WITH_CUDA +#define REGISTER_DIM_GATHER_OUTPLACE_GPUKERNELS(optypename, binop) \ + REGISTER_DIM_GATHER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_GATHER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ + REGISTER_DIM_GATHER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); +#else +#define REGISTER_DIM_GATHER_OUTPLACE_GPUKERNELS(optypename, binop) +#endif // WITH_CUDA + +#define REGISTER_GATHER_OUTPLACE_KERNEL(optypename, binop) \ + REGISTER_DIM_GATHER_OUTPLACE_CPUKERNELS(optypename, binop); \ + REGISTER_DIM_GATHER_OUTPLACE_GPUKERNELS(optypename, binop); + template -class DimGatherKernel final : public user_op::OpKernel { +class DimGatherBaseKernel : public user_op::OpKernel { public: - DimGatherKernel() = default; - ~DimGatherKernel() override = default; + DimGatherBaseKernel() = default; + ~DimGatherBaseKernel() override = default; + virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& index_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* input, + IN_T* output) const = 0; private: void Compute(KernelComputeContext* ctx) const override { @@ 
-47,35 +100,17 @@ class DimGatherKernel final : public user_op::OpKernel { shape2dims(index_tensor->shape()); DimOpIndexNdHelper index_nd_helper(shape_vec.data(), ndim); - DimGatherFunctor()( - ctx->device_ctx(), input_nd_helper, index_nd_helper, ndim, index_tensor->shape().elem_cnt(), - dim, index, input, output); + BinaryOp(ctx->device_ctx(), input_nd_helper, index_nd_helper, ndim, + index_tensor->shape().elem_cnt(), dim, index, input, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_DIM_GATHER_KERNEL(device, dtype, itype) \ - REGISTER_USER_KERNEL("dim_gather") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); +IMPLEMENT_DIMGATHER_KERNEL_CLASS(Update); +IMPLEMENT_DIMGATHER_KERNEL_CLASS(Add); -#define REGISTER_DIM_GATHER_KERNELS_WITH_DEVICE(device) \ - REGISTER_DIM_GATHER_KERNEL(device, float, int32_t) \ - REGISTER_DIM_GATHER_KERNEL(device, double, int32_t) \ - REGISTER_DIM_GATHER_KERNEL(device, int32_t, int32_t) \ - REGISTER_DIM_GATHER_KERNEL(device, float, int64_t) \ - REGISTER_DIM_GATHER_KERNEL(device, double, int64_t) \ - REGISTER_DIM_GATHER_KERNEL(device, int32_t, int64_t) - -REGISTER_DIM_GATHER_KERNELS_WITH_DEVICE(DeviceType::kCPU); - -#ifdef WITH_CUDA -REGISTER_DIM_GATHER_KERNELS_WITH_DEVICE(DeviceType::kGPU); -REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int32_t); -REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int64_t); -#endif // WITH_CUDA +REGISTER_GATHER_OUTPLACE_KERNEL("dim_gather", Update); +REGISTER_GATHER_OUTPLACE_KERNEL("dim_gather_add", Add); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index 4a25d74b7ab..2f2395dda0e 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -23,14 +23,6 @@ limitations under the License. namespace oneflow { -#define DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ \ - FLOATING_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) - -#define DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ \ - DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ \ - FLOAT16_DATA_TYPE_SEQ - constexpr int kDimGatherMaxDimCount = 8; namespace user_op { @@ -41,28 +33,6 @@ using DimOpIndexNdHelper = NdIndexOffsetHelper; template using BinaryOpFn = void (*)(const T* x, T* y); -// Steps for adding a binary operation on scatter are as follows: -// 1. implment binop in DeviceBinOp, for example "Mul": -// OF_DEVICE_FUNC static void Mul(const T* x, T* y) { *y *= *x; } -// -// 2. Implement kernels in dim_scatter_kernels.cpp: -// IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Mul); -// -// 3. Register kernels -// REGISTER_SCATTER_OUTPLACE_KERNEL("dim_scatter_add_like", Add); -// -// 4. Declare Functor in dim_scatter_kernel_util.h: -// DECLARE_DIMSCATTER_FUNCTOR(Mul); -// -// 5. 
Implement functors in dim_scatter_kernel_util.cu and cpp file: -// in .cu file: -// IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Mul); -// INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Mul); -// in .cpp file: -// IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Mul); -// INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Mul); -// - template struct DeviceBinOp { OF_DEVICE_FUNC static void Add(const T* x, T* y) { @@ -76,6 +46,7 @@ struct DeviceBinOp { OF_DEVICE_FUNC static void Update(const T* x, T* y) { *y = *x; } }; +// ----- macros for scatter functors ----- #define DECLARE_DIMSCATTER_FUNCTOR(binop) \ template \ struct DimScatter##binop##Functor final { \ @@ -125,6 +96,56 @@ struct DeviceBinOp { } \ } +// ----- macros for gather functors ----- +#define DECLARE_DIMGATHER_FUNCTOR(binop) \ + template \ + struct DimGather##binop##Functor final { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output); \ + } + +#define IMPLEMENT_DIMGATHER_CPUFUNCTOR(binop) \ + template \ + struct DimGather##binop##Functor final { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { \ + DoDimGatherBinop(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, \ + input, output, DeviceBinOp::binop); \ + } \ + } + +#define IMPLEMENT_DIMGATHER_GPUFUNCTOR(binop) \ + template \ + __global__ void DoCUDADimGather##binop(const DimOpIndexNdHelper input_nd_helper, \ + const DimOpIndexNdHelper index_nd_helper, \ + int ndim, int64_t elem_cnt, int32_t dim, \ + const IDX_T* index, const IN_T* input, IN_T* output) { \ + DoDimGatherBinop(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, \ + input, output, DeviceBinOp::binop); \ + } \ + template \ + struct DimGather##binop##Functor final { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { \ + RUN_CUDA_KERNEL((DoCUDADimGather##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ + input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, \ + output); \ + } \ + }; \ + template \ + struct DimGather##binop##Functor final { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const float16* input, float16* output) { \ + RUN_CUDA_KERNEL((DoCUDADimGather##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ + input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, \ + reinterpret_cast(input), reinterpret_cast(output)); \ + } \ + }; + } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index 1911f8fdf71..b502edf08c8 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -17,6 +17,28 @@ limitations under the License. #define ONEFLOW_USER_KERNELS_DIM_SCATTER_KERNEL_UTIL_H_ #include "oneflow/user/kernels/dim_gather_scatter_util.h" +// Steps for adding a binary operation on scatter are as follows: +// 1. 
implment binop in DeviceBinOp, for example "Mul": +// OF_DEVICE_FUNC static void Mul(const T* x, T* y) { *y *= *x; } +// +// 2. Implement kernels in dim_scatter_kernels.cpp: +// IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Mul); +// +// 3. Register kernels +// REGISTER_SCATTER_OUTPLACE_KERNEL("dim_scatter_mul_like", Mul); +// +// 4. Declare Functor in dim_scatter_kernel_util.h: +// DECLARE_DIMSCATTER_FUNCTOR(Mul); +// +// 5. Implement functors in dim_scatter_kernel_util.cu and cpp file: +// in .cu file: +// IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Mul); +// INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Mul); +// in .cpp file: +// IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Mul); +// INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Mul); +// + namespace oneflow { namespace user_op { diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index accb198c5b1..ed1de33f47c 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -36,75 +36,74 @@ namespace user_op { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ } -#define REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, dtype, itype, optypename, binop) \ +#define REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, dtype, itype, optypename, binop) \ REGISTER_USER_KERNEL(optypename) \ .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -#define REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(device, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, float, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, double, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, int32_t, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, float, int64_t, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, double, int64_t, optypename, binop) \ +#define REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, float, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, double, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, int32_t, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, float, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, double, int64_t, optypename, binop) \ REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, int32_t, int64_t, optypename, binop) #define REGISTER_DIM_SCATTER_OUTPLACE_CPUKERNELS(optypename, binop) \ REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); #ifdef WITH_CUDA -#define REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ +#define REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); #else #define REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop) #endif // WITH_CUDA -#define REGISTER_SCATTER_OUTPLACE_KERNEL(optypename, binop) \ - 
REGISTER_DIM_SCATTER_OUTPLACE_CPUKERNELS(optypename, binop); \ +#define REGISTER_SCATTER_OUTPLACE_KERNEL(optypename, binop) \ + REGISTER_DIM_SCATTER_OUTPLACE_CPUKERNELS(optypename, binop); \ REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop); // ---- REGISTER INPLACE OPS ---- -Maybe SetInplace(const user_op::InferContext&, - user_op::AddInplaceArgPair AddInplaceArgPairFn){ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("output", 0, "like", 0, true)); - return Maybe::Ok(); +Maybe SetInplace(const user_op::InferContext&, + user_op::AddInplaceArgPair AddInplaceArgPairFn) { + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("output", 0, "like", 0, true)); + return Maybe::Ok(); } -#define REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, dtype, itype, optypename, binop) \ - REGISTER_USER_KERNEL(optypename) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value))\ +#define REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, dtype, itype, optypename, binop) \ + REGISTER_USER_KERNEL(optypename) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)) \ .SetInplaceProposalFn(SetInplace); - - -#define REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(device, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, float, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, double, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, int32_t, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, float, int64_t, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, double, int64_t, optypename, binop) \ + +#define REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, float, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, double, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, int32_t, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, float, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, double, int64_t, optypename, binop) \ REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, int32_t, int64_t, optypename, binop) #define REGISTER_DIM_SCATTER_INPLACE_CPUKERNELS(optypename, binop) \ REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); #ifdef WITH_CUDA -#define REGISTER_DIM_SCATTER_INPLACE_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ +#define REGISTER_DIM_SCATTER_INPLACE_GPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_SCATTER_INPLACE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ REGISTER_DIM_SCATTER_INPLACE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); #else #define REGISTER_DIM_SCATTER_INPLACE_GPUKERNELS(optypename, binop) #endif // WITH_CUDA -#define REGISTER_SCATTER_INTPLACE_KERNEL(optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_CPUKERNELS(optypename, binop); \ +#define REGISTER_SCATTER_INTPLACE_KERNEL(optypename, binop) \ + 
REGISTER_DIM_SCATTER_INPLACE_CPUKERNELS(optypename, binop); \ REGISTER_DIM_SCATTER_INPLACE_GPUKERNELS(optypename, binop); template @@ -129,16 +128,18 @@ class DimScatterBaseKernel : public user_op::OpKernel { IN_T* output = out_tensor->mut_dptr(); size_t out_bytes_size = out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); - + Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); - if(!like_tensor->dptr()){ - Memset(ctx->device_ctx(), output, 0, out_bytes_size); - }else { + if (!like_tensor->dptr()) { + Memset(ctx->device_ctx(), output, 0, out_bytes_size); + } else { const IN_T* like = like_tensor->dptr(); - if(output != like) - { - Memcpy(ctx->device_ctx(), output, like_tensor->dptr(), out_bytes_size); - } + if (output != like) { + // wrong at 1n2c + Memcpy(ctx->device_ctx(), output, like_tensor->dptr(), out_bytes_size); + // right at 1n2c (??) + // Memset(ctx->device_ctx(), output, 0, out_bytes_size); + } } int ndim = input_tensor->shape().NumAxes(); diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index 95bde8b7aea..57fba27241c 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp @@ -17,99 +17,107 @@ limitations under the License. #include "oneflow/user/kernels/dim_gather_kernel_util.h" namespace oneflow { - namespace user_op { + +namespace { +Maybe InferTensorDesc(user_op::InferContext* ctx) { + const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); + int64_t input_num_axes = in->shape().NumAxes(); + CHECK_GT_OR_RETURN(input_num_axes, 0); + CHECK_LE_OR_RETURN(input_num_axes, kDimGatherMaxDimCount); + + const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); + int64_t index_num_axes = index->shape().NumAxes(); + CHECK_OR_RETURN(IsIndexDataType(index->data_type())); + + const int32_t dim = ctx->Attr("dim"); + CHECK_GE_OR_RETURN(dim, 0); + CHECK_LT_OR_RETURN(dim, input_num_axes); + CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); + + // split_axs should NOT equals dim when in consistent view + const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); + auto is_split = in_sbp.has_split_parallel(); + if (ctx->parallel_ctx().parallel_num() != 1 && is_split) { + int64_t split_axis = in_sbp.split_parallel().axis(); + CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; + } + + CHECK_OR_RETURN(!in->is_dynamic()); + CHECK_OR_RETURN(!index->is_dynamic()); + + FOR_RANGE(int64_t, i, 0, input_num_axes) { + if (i == dim) { continue; } + CHECK_EQ_OR_RETURN(in->shape().At(i), index->shape().At(i)); + } + + user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); + *out->mut_shape() = index->shape(); + *out->mut_data_type() = in->data_type(); + + return Maybe::Ok(); +} + +void GatherInputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, + const user_op::UserOpConfWrapper&) { + user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); + CHECK(indices_modifier != nullptr); + indices_modifier->set_requires_grad(false); +} + +Maybe InferBatchAxis(user_op::BatchAxisContext* ctx) { + OptInt64* indices_batch_axis = ctx->BatchAxis4ArgNameAndIndex("index", 0); + if (indices_batch_axis->has_value()) { + CHECK_GE_OR_RETURN(indices_batch_axis->value(), 0); + CHECK_LE_OR_RETURN( + indices_batch_axis->value(), + ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0).shape().NumAxes() - 1); + } + *ctx->BatchAxis4ArgNameAndIndex("output", 0) = *indices_batch_axis; + return Maybe::Ok(); +} + 
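
Before the SBP builder below, a note on how the ops wired up in the preceding patches surface on the Python side. A rough usage sketch, assuming the lazy global-function API of this OneFlow version; shapes, values and the job name are made up for illustration:

# hypothetical usage sketch, not code from this patch
import numpy as np
import oneflow as flow
import oneflow.typing as tp

@flow.global_function()
def scatter_add_like_job(
    src: tp.Numpy.Placeholder((2, 2), dtype=flow.float32),
    index: tp.Numpy.Placeholder((2, 2), dtype=flow.int32),
    like: tp.Numpy.Placeholder((3, 2), dtype=flow.float32),
) -> tp.Numpy:
    # accumulate src into a zero tensor shaped like `like`, along dim 0
    return flow.dim_scatter_add_like(0, index, src, like)

out = scatter_add_like_job(
    np.array([[1, 2], [3, 4]], dtype=np.float32),
    np.array([[0, 2], [2, 1]], dtype=np.int32),
    np.zeros((3, 2), dtype=np.float32),
)
# with the semantics sketched earlier, out should be [[1, 0], [0, 4], [3, 2]]
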
+Maybe BuildSbp(user_op::SbpContext* ctx) { + const user_op::TensorDesc& index_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); + int64_t index_num_axes = index_tensor.shape().NumAxes(); + const int32_t dim = ctx->Attr("dim"); + + FOR_RANGE(int64_t, i, 0, index_num_axes) { + if (i != dim) { + ctx->NewBuilder() + .Split(user_op::OpArg("index", 0), i) + .Split(user_op::OpArg("input", 0), i) + .Split(user_op::OpArg("output", 0), i) + .Build(); + } else if (i == dim) { + ctx->NewBuilder() + .Broadcast(user_op::OpArg("input", 0)) + .Split(user_op::OpArg("index", 0), i) + .Split(user_op::OpArg("output", 0), i) + .Build(); + } + } + + ctx->NewBuilder() + .PartialSum(user_op::OpArg("input", 0)) + .Broadcast(user_op::OpArg("index", 0)) + .PartialSum(user_op::OpArg("output", 0)) + .Build(); + return Maybe::Ok(); +} +} // namespace + REGISTER_USER_OP("dim_gather") .Input("input") .Input("index") .Output("output") .Attr("dim") - .SetTensorDescInferFn([](user_op::InferContext* ctx) -> Maybe { - const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); - int64_t input_num_axes = in->shape().NumAxes(); - CHECK_GT_OR_RETURN(input_num_axes, 0); - CHECK_LE_OR_RETURN(input_num_axes, kDimGatherMaxDimCount); - - const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); - int64_t index_num_axes = index->shape().NumAxes(); - CHECK_OR_RETURN(IsIndexDataType(index->data_type())); - - const int32_t dim = ctx->Attr("dim"); - CHECK_GE_OR_RETURN(dim, 0); - CHECK_LT_OR_RETURN(dim, input_num_axes); - CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); - - // split_axs should NOT equals dim when in consistent view - const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); - auto is_split = in_sbp.has_split_parallel(); - if (ctx->parallel_ctx().parallel_num() != 1 && is_split) { - int64_t split_axis = in_sbp.split_parallel().axis(); - CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; - } - - CHECK_OR_RETURN(!in->is_dynamic()); - CHECK_OR_RETURN(!index->is_dynamic()); - - FOR_RANGE(int64_t, i, 0, input_num_axes) { - if (i == dim) { continue; } - CHECK_EQ_OR_RETURN(in->shape().At(i), index->shape().At(i)); - } - - user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); - *out->mut_shape() = index->shape(); - *out->mut_data_type() = in->data_type(); - - return Maybe::Ok(); - }) - .SetInputArgModifyFn([](user_op::GetInputArgModifier GetInputArgModifierFn, - const user_op::UserOpConfWrapper&) { - user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); - CHECK(indices_modifier != nullptr); - indices_modifier->set_requires_grad(false); - }) - .SetBatchAxisInferFn([](user_op::BatchAxisContext* ctx) -> Maybe { - OptInt64* indices_batch_axis = ctx->BatchAxis4ArgNameAndIndex("index", 0); - if (indices_batch_axis->has_value()) { - CHECK_GE_OR_RETURN(indices_batch_axis->value(), 0); - CHECK_LE_OR_RETURN( - indices_batch_axis->value(), - ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0).shape().NumAxes() - 1); - } - *ctx->BatchAxis4ArgNameAndIndex("output", 0) = *indices_batch_axis; - return Maybe::Ok(); - }) - .SetGetSbpFn([](user_op::SbpContext* ctx) -> Maybe { - const user_op::TensorDesc& index_tensor = - ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); - int64_t index_num_axes = index_tensor.shape().NumAxes(); - const int32_t dim = ctx->Attr("dim"); - - FOR_RANGE(int64_t, i, 0, index_num_axes) { - if (i != dim) { - ctx->NewBuilder() - .Split(user_op::OpArg("index", 0), i) - 
.Split(user_op::OpArg("input", 0), i) - .Split(user_op::OpArg("output", 0), i) - .Build(); - } else if (i == dim) { - ctx->NewBuilder() - .Broadcast(user_op::OpArg("input", 0)) - .Split(user_op::OpArg("index", 0), i) - .Split(user_op::OpArg("output", 0), i) - .Build(); - } - } - - ctx->NewBuilder() - .PartialSum(user_op::OpArg("input", 0)) - .Broadcast(user_op::OpArg("index", 0)) - .PartialSum(user_op::OpArg("output", 0)) - .Build(); - return Maybe::Ok(); - }); + .SetTensorDescInferFn(InferTensorDesc) + .SetInputArgModifyFn(GatherInputArgModifierFn) + .SetBatchAxisInferFn(InferBatchAxis) + .SetGetSbpFn(BuildSbp); REGISTER_USER_OP_GRAD("dim_gather").SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { - const auto op_grad_name = ctx->FwOp().op_name() + "_grad"; ctx->DefineOp(op_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 99a20634436..229bcf1fae2 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -65,7 +65,7 @@ Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierF } Maybe InplaceInputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, - const user_op::UserOpConfWrapper&) { + const user_op::UserOpConfWrapper&) { user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); CHECK(like_arg_modifier != nullptr); like_arg_modifier->set_requires_grad(false); @@ -105,29 +105,29 @@ Maybe SetSbp(user_op::SbpContext* ctx) { } } // namespace -#define REGISTER_SCATTER_LIKE_OP(optypename) \ -REGISTER_USER_OP(optypename) \ - .Input("like") \ - .Input("input") \ - .Input("index") \ - .Output("output") \ - .Attr("dim") \ - .SetTensorDescInferFn(InferTensorDesc) \ - .SetInputArgModifyFn(InputArgModifierFn) \ - .SetBatchAxisInferFn(InferBatchAxis) \ - .SetGetSbpFn(SetSbp) - -#define REGISTER_SCATTER_INPLACE_OP(optypename) \ -REGISTER_USER_OP(optypename) \ - .Input("like") \ - .Input("input") \ - .Input("index") \ - .Output("output") \ - .Attr("dim") \ - .SetTensorDescInferFn(InferTensorDesc) \ - .SetInputArgModifyFn(InplaceInputArgModifierFn) \ - .SetBatchAxisInferFn(InferBatchAxis) \ - .SetGetSbpFn(SetSbp) +#define REGISTER_SCATTER_LIKE_OP(optypename) \ + REGISTER_USER_OP(optypename) \ + .Input("like") \ + .Input("input") \ + .Input("index") \ + .Output("output") \ + .Attr("dim") \ + .SetTensorDescInferFn(InferTensorDesc) \ + .SetInputArgModifyFn(InputArgModifierFn) \ + .SetBatchAxisInferFn(InferBatchAxis) \ + .SetGetSbpFn(SetSbp) + +#define REGISTER_SCATTER_INPLACE_OP(optypename) \ + REGISTER_USER_OP(optypename) \ + .Input("like") \ + .Input("input") \ + .Input("index") \ + .Output("output") \ + .Attr("dim") \ + .SetTensorDescInferFn(InferTensorDesc) \ + .SetInputArgModifyFn(InplaceInputArgModifierFn) \ + .SetBatchAxisInferFn(InferBatchAxis) \ + .SetGetSbpFn(SetSbp) REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); REGISTER_SCATTER_LIKE_OP("dim_scatter_update_like"); From 99be5ad6e3e1db991a39d03526be18e105952894 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Thu, 26 Nov 2020 13:28:31 +0800 Subject: [PATCH 11/82] add grad of scatter_add_like --- oneflow/user/ops/dim_gather_op.cpp | 9 +++++---- oneflow/user/ops/dim_scatter_ops.cpp | 29 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index 57fba27241c..e2eebeb0f59 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ 
b/oneflow/user/ops/dim_gather_op.cpp @@ -46,10 +46,11 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { CHECK_OR_RETURN(!in->is_dynamic()); CHECK_OR_RETURN(!index->is_dynamic()); - FOR_RANGE(int64_t, i, 0, input_num_axes) { - if (i == dim) { continue; } - CHECK_EQ_OR_RETURN(in->shape().At(i), index->shape().At(i)); - } + // for scatter backword, this check moved to python + // FOR_RANGE(int64_t, i, 0, input_num_axes) { + // if (i == dim) { continue; } + // CHECK_EQ_OR_RETURN(in->shape().At(i), index->shape().At(i)); + // } user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = index->shape(); diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 229bcf1fae2..ddc617cc130 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -61,6 +61,10 @@ Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierF CHECK(like_arg_modifier != nullptr); like_arg_modifier->set_use_header_only(true); like_arg_modifier->set_requires_grad(false); + + user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); + CHECK(indices_modifier != nullptr); + indices_modifier->set_requires_grad(false); return Maybe::Ok(); } @@ -69,6 +73,10 @@ Maybe InplaceInputArgModifierFn(user_op::GetInputArgModifier GetInputArgMo user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); CHECK(like_arg_modifier != nullptr); like_arg_modifier->set_requires_grad(false); + + user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); + CHECK(indices_modifier != nullptr); + indices_modifier->set_requires_grad(false); return Maybe::Ok(); } @@ -129,6 +137,27 @@ Maybe SetSbp(user_op::SbpContext* ctx) { .SetBatchAxisInferFn(InferBatchAxis) \ .SetGetSbpFn(SetSbp) +REGISTER_USER_OP_GRAD("dim_scatter_add_like") + .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { + const auto op_grad_name = ctx->FwOp().op_name() + "_grad"; + + ctx->DefineOp(op_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { + return builder + .OpTypeName("dim_gather") // dim_gather(grad, dim, index) -> output + .InputBind("index", ctx->FwOp().input("index", 0)) // gather.index <- scatter.index + .InputBind("input", + ctx->FwOp().output_grad("output", 0)) // gather.input <- grad of scatter.out + .Output("output") + .Attr("dim", ctx->FwOp().attr("dim")) + .Build(); + }); + + ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), + [&ctx, &op_grad_name]() -> const std::string& { + return ctx->GetOp(op_grad_name).output("output", 0); + }); + }); + REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); REGISTER_SCATTER_LIKE_OP("dim_scatter_update_like"); REGISTER_SCATTER_INPLACE_OP("dim_scatter_add"); From 4f89f18c04d986a7dcedaefe74b1b10b0cc57803 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Tue, 15 Dec 2020 12:26:44 +0800 Subject: [PATCH 12/82] refine (add src, like versions for scatter) --- oneflow/user/kernels/dim_scatter_kernels.cpp | 16 +++++----- oneflow/user/ops/dim_scatter_ops.cpp | 31 ++++++++++++++------ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index ed1de33f47c..37d0c00032d 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include "oneflow/core/common/error.pb.h" #include "oneflow/core/common/util.h" #include "oneflow/user/kernels/dim_scatter_kernel_util.h" @@ -130,16 +131,13 @@ class DimScatterBaseKernel : public user_op::OpKernel { out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); - if (!like_tensor->dptr()) { + Tensor* src_tensor = ctx->Tensor4ArgNameAndIndex("src", 0); + if(src_tensor){ + Memcpy(ctx->device_ctx(), output, src_tensor->dptr(), out_bytes_size); + }else if(like_tensor){ Memset(ctx->device_ctx(), output, 0, out_bytes_size); - } else { - const IN_T* like = like_tensor->dptr(); - if (output != like) { - // wrong at 1n2c - Memcpy(ctx->device_ctx(), output, like_tensor->dptr(), out_bytes_size); - // right at 1n2c (??) - // Memset(ctx->device_ctx(), output, 0, out_bytes_size); - } + }else{ + Error::Unimplemented(); } int ndim = input_tensor->shape().NumAxes(); diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index ddc617cc130..e6366da4984 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include "oneflow/core/common/error.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/framework/user_op_registry.h" #include "oneflow/user/kernels/dim_gather_scatter_util.h" @@ -26,8 +28,8 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { const TensorDesc* input = ctx->TensorDesc4ArgNameAndIndex("input", 0); const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); const TensorDesc* like = ctx->TensorDesc4ArgNameAndIndex("like", 0); - - const Shape& like_shape = like->shape(); + const TensorDesc* src = ctx->TensorDesc4ArgNameAndIndex("src", 0); + int32_t dim = ctx->Attr("dim"); const SbpParallel& input_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); @@ -42,14 +44,23 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { int64_t index_num_axes = index->shape().NumAxes(); CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); - CHECK_EQ_OR_RETURN(input_num_axes, like_shape.NumAxes()); + + int64_t output_num_axes = 0; + if(src){ + output_num_axes = src->shape().NumAxes(); + }else if(like){ + output_num_axes = like->shape().NumAxes(); + }else{ + Error::Unimplemented(); + } + CHECK_EQ_OR_RETURN(input_num_axes, output_num_axes); FOR_RANGE(int64_t, i, 0, input_num_axes) { CHECK_EQ_OR_RETURN(index->shape().At(i), input->shape().At(i)); } user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); - *out->mut_shape() = like_shape; + *out->mut_shape() = src?src->shape():like->shape(); *out->mut_data_type() = input->data_type(); return Maybe::Ok(); @@ -70,9 +81,9 @@ Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierF Maybe InplaceInputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, const user_op::UserOpConfWrapper&) { - user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); - CHECK(like_arg_modifier != nullptr); - like_arg_modifier->set_requires_grad(false); + user_op::InputArgModifier* src_arg_modifier = GetInputArgModifierFn("src", 0); + CHECK(src_arg_modifier != nullptr); + src_arg_modifier->set_requires_grad(false); 
user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); CHECK(indices_modifier != nullptr); @@ -99,6 +110,7 @@ Maybe SetSbp(user_op::SbpContext* ctx) { .Split(user_op::OpArg("input", 0), i) .Split(user_op::OpArg("output", 0), i) .Split(user_op::OpArg("like", 0), i) + .Split(user_op::OpArg("src", 0), i) .Build(); } } @@ -108,6 +120,7 @@ Maybe SetSbp(user_op::SbpContext* ctx) { .Broadcast(user_op::OpArg("index", 0)) .PartialSum(user_op::OpArg("output", 0)) .PartialSum(user_op::OpArg("like", 0)) + .PartialSum(user_op::OpArg("src", 0)) .Build(); return Maybe::Ok(); } @@ -115,7 +128,7 @@ Maybe SetSbp(user_op::SbpContext* ctx) { #define REGISTER_SCATTER_LIKE_OP(optypename) \ REGISTER_USER_OP(optypename) \ - .Input("like") \ + .OptionalInput("like") \ .Input("input") \ .Input("index") \ .Output("output") \ @@ -127,7 +140,7 @@ Maybe SetSbp(user_op::SbpContext* ctx) { #define REGISTER_SCATTER_INPLACE_OP(optypename) \ REGISTER_USER_OP(optypename) \ - .Input("like") \ + .OptionalInput("src") \ .Input("input") \ .Input("index") \ .Output("output") \ From 35c7457439ba80fa9550e094634625eb541a15e7 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Tue, 15 Dec 2020 15:34:42 +0800 Subject: [PATCH 13/82] refine src/like tensor --- oneflow/python/ops/array_ops.py | 32 ++++++++++++--- oneflow/user/kernels/dim_scatter_kernels.cpp | 13 +++--- oneflow/user/ops/dim_scatter_ops.cpp | 43 +++++++++++--------- 3 files changed, 58 insertions(+), 30 deletions(-) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 8b1f85b5dd2..27b0d3f0c41 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2337,12 +2337,34 @@ def dim_scatter_update_like( .RemoteBlobList()[0] ) +@oneflow_export("dim_scatter_update") +def dim_scatter_update_like( + dim: int, + index: remote_blob_util.BlobDef, + input: remote_blob_util.BlobDef, + src: remote_blob_util.BlobDef, + name: Optional[str] = None, +) -> remote_blob_util.BlobDef: + return ( + flow.user_op_builder( + name if name is not None else id_util.UniqueStr("DimScatterUpdate_") + ) + .Op("dim_scatter_update") + .Input("input", [input]) + .Input("index", [index]) + .Input("src", [src]) + .Output("output") + .Attr("dim", int(dim)) + .Build() + .InferAndTryRun() + .RemoteBlobList()[0] + ) @oneflow_export("dim_scatter_add_like") def dim_scatter_add_like( dim: int, index: remote_blob_util.BlobDef, - src: remote_blob_util.BlobDef, + input: remote_blob_util.BlobDef, like: remote_blob_util.BlobDef, name: Optional[str] = None, ) -> remote_blob_util.BlobDef: @@ -2351,7 +2373,7 @@ def dim_scatter_add_like( name if name is not None else id_util.UniqueStr("DimScatterAddLike_") ) .Op("dim_scatter_add_like") - .Input("input", [src]) + .Input("input", [input]) .Input("index", [index]) .Input("like", [like]) .Output("output") @@ -2366,8 +2388,8 @@ def dim_scatter_add_like( def dim_scatter_add( dim: int, index: remote_blob_util.BlobDef, + input: remote_blob_util.BlobDef, src: remote_blob_util.BlobDef, - like: remote_blob_util.BlobDef, name: Optional[str] = None, ) -> remote_blob_util.BlobDef: return ( @@ -2375,9 +2397,9 @@ def dim_scatter_add( name if name is not None else id_util.UniqueStr("DimScatterAddLike_") ) .Op("dim_scatter_add") - .Input("input", [src]) + .Input("input", [input]) .Input("index", [index]) - .Input("like", [like]) + .Input("src", [src]) .Output("output") .Attr("dim", int(dim)) .Build() diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 
37d0c00032d..4add3bd79eb 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -30,9 +30,9 @@ namespace user_op { private: \ void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) const override { \ + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) const override { \ DimScatter##binop##Functor()( \ - ctx, input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, src, output); \ + ctx, input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, output); \ } \ bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ } @@ -71,7 +71,7 @@ namespace user_op { // ---- REGISTER INPLACE OPS ---- Maybe SetInplace(const user_op::InferContext&, user_op::AddInplaceArgPair AddInplaceArgPairFn) { - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("output", 0, "like", 0, true)); + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("output", 0, "src", 0, true)); return Maybe::Ok(); } @@ -114,7 +114,7 @@ class DimScatterBaseKernel : public user_op::OpKernel { ~DimScatterBaseKernel() override = default; virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* src, + int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) const = 0; private: @@ -124,7 +124,7 @@ class DimScatterBaseKernel : public user_op::OpKernel { Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); const int32_t dim = ctx->Attr("dim"); - const IN_T* src = input_tensor->dptr(); + const IN_T* input = input_tensor->dptr(); const IDX_T* index = index_tensor->dptr(); IN_T* output = out_tensor->mut_dptr(); size_t out_bytes_size = @@ -152,7 +152,7 @@ class DimScatterBaseKernel : public user_op::OpKernel { DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); BinaryOp(ctx->device_ctx(), input_nd_helper, output_nd_helper, ndim, - input_tensor->shape().elem_cnt(), dim, index, src, output); + input_tensor->shape().elem_cnt(), dim, index, input, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -163,6 +163,7 @@ IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Update); REGISTER_SCATTER_OUTPLACE_KERNEL("dim_scatter_add_like", Add); REGISTER_SCATTER_OUTPLACE_KERNEL("dim_scatter_update_like", Update); REGISTER_SCATTER_INTPLACE_KERNEL("dim_scatter_add", Add); +REGISTER_SCATTER_INTPLACE_KERNEL("dim_scatter_update", Update); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index e6366da4984..36b75f6b60d 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -150,30 +150,35 @@ Maybe SetSbp(user_op::SbpContext* ctx) { .SetBatchAxisInferFn(InferBatchAxis) \ .SetGetSbpFn(SetSbp) -REGISTER_USER_OP_GRAD("dim_scatter_add_like") - .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { - const auto op_grad_name = ctx->FwOp().op_name() + "_grad"; - - ctx->DefineOp(op_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { - return builder - .OpTypeName("dim_gather") // dim_gather(grad, dim, index) -> output - .InputBind("index", ctx->FwOp().input("index", 0)) // gather.index <- scatter.index - .InputBind("input", - ctx->FwOp().output_grad("output", 0)) // gather.input <- grad of 
scatter.out - .Output("output") - .Attr("dim", ctx->FwOp().attr("dim")) - .Build(); +#define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ + REGISTER_USER_OP_GRAD(optypename) \ + .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) {\ + const auto op_grad_name = ctx->FwOp().op_name() + "_grad"; \ + ctx->DefineOp(op_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) {\ + return builder\ + .OpTypeName("dim_gather") \ + .InputBind("index", ctx->FwOp().input("index", 0))\ + .InputBind("input", ctx->FwOp().output_grad("output", 0))\ + .Output("output")\ + .Attr("dim", ctx->FwOp().attr("dim"))\ + .Build();\ + });\ + ctx->FwOp().InputGradBind(user_op::OpArg("input", 0),\ + [&ctx, &op_grad_name]() -> const std::string& {\ + return ctx->GetOp(op_grad_name).output("output", 0);\ + });\ }); - ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), - [&ctx, &op_grad_name]() -> const std::string& { - return ctx->GetOp(op_grad_name).output("output", 0); - }); - }); - REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); REGISTER_SCATTER_LIKE_OP("dim_scatter_update_like"); REGISTER_SCATTER_INPLACE_OP("dim_scatter_add"); +REGISTER_SCATTER_INPLACE_OP("dim_scatter_update"); + +REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_add_like"); +REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update_like"); +REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_add"); +REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update"); + } // namespace user_op } // namespace oneflow \ No newline at end of file From bd2520c547aafb26a19df65d304e27578e318bad Mon Sep 17 00:00:00 2001 From: doombeaker Date: Wed, 16 Dec 2020 09:27:19 +0800 Subject: [PATCH 14/82] gather refine(no need outplace/inplace versions) --- .../user/kernels/dim_gather_kernel_util.cpp | 3 -- .../user/kernels/dim_gather_kernel_util.cu | 3 -- oneflow/user/kernels/dim_gather_kernel_util.h | 3 +- oneflow/user/kernels/dim_gather_kernels.cpp | 41 +++++++++---------- 4 files changed, 20 insertions(+), 30 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cpp b/oneflow/user/kernels/dim_gather_kernel_util.cpp index be8fde77d04..f8c893f0169 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cpp +++ b/oneflow/user/kernels/dim_gather_kernel_util.cpp @@ -23,8 +23,5 @@ namespace user_op { IMPLEMENT_DIMGATHER_CPUFUNCTOR(Update); INSTANTIATE_DIM_GATHER_CPUFUNCTORS(Update); -IMPLEMENT_DIMGATHER_CPUFUNCTOR(Add); -INSTANTIATE_DIM_GATHER_CPUFUNCTORS(Add); - } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cu b/oneflow/user/kernels/dim_gather_kernel_util.cu index c60421110f2..5b9e818ed99 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cu +++ b/oneflow/user/kernels/dim_gather_kernel_util.cu @@ -24,9 +24,6 @@ namespace user_op { IMPLEMENT_DIMGATHER_GPUFUNCTOR(Update); INSTANTIATE_DIM_GATHER_GPUFUNCTORS(Update); -IMPLEMENT_DIMGATHER_GPUFUNCTOR(Add); -INSTANTIATE_DIM_GATHER_GPUFUNCTORS(Add); - } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernel_util.h b/oneflow/user/kernels/dim_gather_kernel_util.h index 6de537f3c5c..e717c9be736 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.h +++ b/oneflow/user/kernels/dim_gather_kernel_util.h @@ -35,14 +35,13 @@ limitations under the License. // 4. Implement kernels in dim_gather_kernels.cpp: // IMPLEMENT_DIMGATHER_KERNEL_CLASS(Mul); // -// 5. Register kernels +// 5. 
Register kernels in dim_gather_kernels.cpp: // REGISTER_GATHER_OUTPLACE_KERNEL("dim_gather_mul_like", Mul); namespace oneflow { namespace user_op { DECLARE_DIMGATHER_FUNCTOR(Update); -DECLARE_DIMGATHER_FUNCTOR(Add); template OF_DEVICE_FUNC void DoDimGatherBinop(const DimOpIndexNdHelper& input_nd_helper, diff --git a/oneflow/user/kernels/dim_gather_kernels.cpp b/oneflow/user/kernels/dim_gather_kernels.cpp index 2e1314a4a95..dedc3f8eb9f 100644 --- a/oneflow/user/kernels/dim_gather_kernels.cpp +++ b/oneflow/user/kernels/dim_gather_kernels.cpp @@ -37,36 +37,36 @@ namespace user_op { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ }; -#define REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, dtype, itype, optypename, binop) \ +#define REGISTER_DIM_GATHER_KERNEL(device, dtype, itype, optypename, binop) \ REGISTER_USER_KERNEL(optypename) \ .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -#define REGISTER_DIM_GATHER_BINOP_OUT_KERNELS_DEVICE(device, optypename, binop) \ - REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, float, int32_t, optypename, binop) \ - REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, double, int32_t, optypename, binop) \ - REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, int32_t, int32_t, optypename, binop) \ - REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, float, int64_t, optypename, binop) \ - REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, double, int64_t, optypename, binop) \ - REGISTER_DIM_GATHER_OUTPLACE_KERNEL(device, int32_t, int64_t, optypename, binop) +#define REGISTER_DIM_GATHER_BINOP_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, float, int32_t, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, double, int32_t, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, int32_t, int32_t, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, float, int64_t, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, double, int64_t, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, int32_t, int64_t, optypename, binop) -#define REGISTER_DIM_GATHER_OUTPLACE_CPUKERNELS(optypename, binop) \ - REGISTER_DIM_GATHER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); +#define REGISTER_DIM_GATHER_CPUKERNELS(optypename, binop) \ + REGISTER_DIM_GATHER_BINOP_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); #ifdef WITH_CUDA -#define REGISTER_DIM_GATHER_OUTPLACE_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_GATHER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ - REGISTER_DIM_GATHER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ - REGISTER_DIM_GATHER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); +#define REGISTER_DIM_GATHER_GPUKERNELS(optypename, binop) \ + REGISTER_DIM_GATHER_BINOP_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ + REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); #else -#define REGISTER_DIM_GATHER_OUTPLACE_GPUKERNELS(optypename, binop) +#define REGISTER_DIM_GATHER_GPUKERNELS(optypename, binop) #endif // WITH_CUDA -#define REGISTER_GATHER_OUTPLACE_KERNEL(optypename, binop) \ - REGISTER_DIM_GATHER_OUTPLACE_CPUKERNELS(optypename, binop); \ - REGISTER_DIM_GATHER_OUTPLACE_GPUKERNELS(optypename, binop); +#define REGISTER_GATHER_KERNEL(optypename, binop) \ + 
REGISTER_DIM_GATHER_CPUKERNELS(optypename, binop); \ + REGISTER_DIM_GATHER_GPUKERNELS(optypename, binop); template class DimGatherBaseKernel : public user_op::OpKernel { @@ -107,10 +107,7 @@ class DimGatherBaseKernel : public user_op::OpKernel { }; IMPLEMENT_DIMGATHER_KERNEL_CLASS(Update); -IMPLEMENT_DIMGATHER_KERNEL_CLASS(Add); - -REGISTER_GATHER_OUTPLACE_KERNEL("dim_gather", Update); -REGISTER_GATHER_OUTPLACE_KERNEL("dim_gather_add", Add); +REGISTER_GATHER_KERNEL("dim_gather", Update); } // namespace user_op } // namespace oneflow From c5515a0c32795641f20faa615b2e5ed1423cfac9 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Wed, 16 Dec 2020 17:15:19 +0800 Subject: [PATCH 15/82] reformat --- oneflow/python/ops/array_ops.py | 2 + oneflow/user/kernels/dim_gather_kernels.cpp | 16 +++---- oneflow/user/kernels/dim_scatter_kernels.cpp | 37 +++++++-------- oneflow/user/ops/dim_scatter_ops.cpp | 48 ++++++++++---------- 4 files changed, 52 insertions(+), 51 deletions(-) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 27b0d3f0c41..7abfe60d307 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2337,6 +2337,7 @@ def dim_scatter_update_like( .RemoteBlobList()[0] ) + @oneflow_export("dim_scatter_update") def dim_scatter_update_like( dim: int, @@ -2360,6 +2361,7 @@ def dim_scatter_update_like( .RemoteBlobList()[0] ) + @oneflow_export("dim_scatter_add_like") def dim_scatter_add_like( dim: int, diff --git a/oneflow/user/kernels/dim_gather_kernels.cpp b/oneflow/user/kernels/dim_gather_kernels.cpp index dedc3f8eb9f..ac2c62acbe8 100644 --- a/oneflow/user/kernels/dim_gather_kernels.cpp +++ b/oneflow/user/kernels/dim_gather_kernels.cpp @@ -37,19 +37,19 @@ namespace user_op { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ }; -#define REGISTER_DIM_GATHER_KERNEL(device, dtype, itype, optypename, binop) \ +#define REGISTER_DIM_GATHER_KERNEL(device, dtype, itype, optypename, binop) \ REGISTER_USER_KERNEL(optypename) \ .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -#define REGISTER_DIM_GATHER_BINOP_KERNELS_DEVICE(device, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, float, int32_t, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, double, int32_t, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, int32_t, int32_t, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, float, int64_t, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, double, int64_t, optypename, binop) \ +#define REGISTER_DIM_GATHER_BINOP_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, float, int32_t, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, double, int32_t, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, int32_t, int32_t, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, float, int64_t, optypename, binop) \ + REGISTER_DIM_GATHER_KERNEL(device, double, int64_t, optypename, binop) \ REGISTER_DIM_GATHER_KERNEL(device, int32_t, int64_t, optypename, binop) #define REGISTER_DIM_GATHER_CPUKERNELS(optypename, binop) \ @@ -57,7 +57,7 @@ namespace user_op { #ifdef WITH_CUDA #define REGISTER_DIM_GATHER_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_GATHER_BINOP_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_GATHER_BINOP_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ 
REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); #else diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 4add3bd79eb..42613ea083e 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -20,21 +20,22 @@ limitations under the License. namespace oneflow { namespace user_op { -#define IMPLEMENT_DIMSCATTER_KERNEL_CLASS(binop) \ - template \ - class DimScatter##binop##Kernel final : public DimScatterBaseKernel { \ - public: \ - DimScatter##binop##Kernel() = default; \ - ~DimScatter##binop##Kernel() override = default; \ - \ - private: \ - void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) const override { \ - DimScatter##binop##Functor()( \ - ctx, input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, output); \ - } \ - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ +#define IMPLEMENT_DIMSCATTER_KERNEL_CLASS(binop) \ + template \ + class DimScatter##binop##Kernel final : public DimScatterBaseKernel { \ + public: \ + DimScatter##binop##Kernel() = default; \ + ~DimScatter##binop##Kernel() override = default; \ + \ + private: \ + void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ + int32_t dim, const IDX_T* index, const IN_T* input, \ + IN_T* output) const override { \ + DimScatter##binop##Functor()( \ + ctx, input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, output); \ + } \ + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ } #define REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, dtype, itype, optypename, binop) \ @@ -132,11 +133,11 @@ class DimScatterBaseKernel : public user_op::OpKernel { Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); Tensor* src_tensor = ctx->Tensor4ArgNameAndIndex("src", 0); - if(src_tensor){ + if (src_tensor) { Memcpy(ctx->device_ctx(), output, src_tensor->dptr(), out_bytes_size); - }else if(like_tensor){ + } else if (like_tensor) { Memset(ctx->device_ctx(), output, 0, out_bytes_size); - }else{ + } else { Error::Unimplemented(); } diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 36b75f6b60d..ea618590062 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -29,7 +29,7 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); const TensorDesc* like = ctx->TensorDesc4ArgNameAndIndex("like", 0); const TensorDesc* src = ctx->TensorDesc4ArgNameAndIndex("src", 0); - + int32_t dim = ctx->Attr("dim"); const SbpParallel& input_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); @@ -46,11 +46,11 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); int64_t output_num_axes = 0; - if(src){ - output_num_axes = src->shape().NumAxes(); - }else if(like){ - output_num_axes = like->shape().NumAxes(); - }else{ + if (src) { + output_num_axes = src->shape().NumAxes(); + } else if (like) { + output_num_axes = like->shape().NumAxes(); + } else { Error::Unimplemented(); } 
CHECK_EQ_OR_RETURN(input_num_axes, output_num_axes); @@ -60,7 +60,7 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { } user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); - *out->mut_shape() = src?src->shape():like->shape(); + *out->mut_shape() = src ? src->shape() : like->shape(); *out->mut_data_type() = input->data_type(); return Maybe::Ok(); @@ -150,23 +150,22 @@ Maybe SetSbp(user_op::SbpContext* ctx) { .SetBatchAxisInferFn(InferBatchAxis) \ .SetGetSbpFn(SetSbp) -#define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ - REGISTER_USER_OP_GRAD(optypename) \ - .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) {\ - const auto op_grad_name = ctx->FwOp().op_name() + "_grad"; \ - ctx->DefineOp(op_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) {\ - return builder\ - .OpTypeName("dim_gather") \ - .InputBind("index", ctx->FwOp().input("index", 0))\ - .InputBind("input", ctx->FwOp().output_grad("output", 0))\ - .Output("output")\ - .Attr("dim", ctx->FwOp().attr("dim"))\ - .Build();\ - });\ - ctx->FwOp().InputGradBind(user_op::OpArg("input", 0),\ - [&ctx, &op_grad_name]() -> const std::string& {\ - return ctx->GetOp(op_grad_name).output("output", 0);\ - });\ +#define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ + REGISTER_USER_OP_GRAD(optypename) \ + .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ + const auto op_grad_name = ctx->FwOp().op_name() + "_grad"; \ + ctx->DefineOp(op_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ + return builder.OpTypeName("dim_gather") \ + .InputBind("index", ctx->FwOp().input("index", 0)) \ + .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ + .Output("output") \ + .Attr("dim", ctx->FwOp().attr("dim")) \ + .Build(); \ + }); \ + ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ + [&ctx, &op_grad_name]() -> const std::string& { \ + return ctx->GetOp(op_grad_name).output("output", 0); \ + }); \ }); REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); @@ -179,6 +178,5 @@ REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update_like"); REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_add"); REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update"); - } // namespace user_op } // namespace oneflow \ No newline at end of file From e1842934eee9a1e45b363e6067ec54ed3e421b9c Mon Sep 17 00:00:00 2001 From: doombeaker Date: Wed, 16 Dec 2020 18:00:28 +0800 Subject: [PATCH 16/82] refine --- oneflow/python/ops/array_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 7abfe60d307..26b0f1ef712 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2318,7 +2318,7 @@ def amp_white_identity( def dim_scatter_update_like( dim: int, index: remote_blob_util.BlobDef, - src: remote_blob_util.BlobDef, + input: remote_blob_util.BlobDef, like: remote_blob_util.BlobDef, name: Optional[str] = None, ) -> remote_blob_util.BlobDef: @@ -2327,7 +2327,7 @@ def dim_scatter_update_like( name if name is not None else id_util.UniqueStr("DimScatterUpdateLike_") ) .Op("dim_scatter_update_like") - .Input("input", [src]) + .Input("input", [input]) .Input("index", [index]) .Input("like", [like]) .Output("output") @@ -2339,7 +2339,7 @@ def dim_scatter_update_like( @oneflow_export("dim_scatter_update") -def dim_scatter_update_like( +def dim_scatter_update( dim: int, index: remote_blob_util.BlobDef, input: remote_blob_util.BlobDef, From c52946164d4cb3064ffa5ef851531d24e4de97cf Mon Sep 17 00:00:00 2001 From: doombeaker 
Date: Mon, 28 Dec 2020 18:38:49 +0800 Subject: [PATCH 17/82] test case of dim scatter --- .../python/test/ops/test_dim_scatter_add.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 oneflow/python/test/ops/test_dim_scatter_add.py diff --git a/oneflow/python/test/ops/test_dim_scatter_add.py b/oneflow/python/test/ops/test_dim_scatter_add.py new file mode 100644 index 00000000000..61d3101049d --- /dev/null +++ b/oneflow/python/test/ops/test_dim_scatter_add.py @@ -0,0 +1,69 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import oneflow as flow +import numpy as np +import oneflow.typing as oft +from test_util import GenArgList +import unittest +from collections import OrderedDict +import os + + +def gen_scatter_add_like_test_sample( + input_shape, index_shape, dim, like_shape, is_float=True +): + def _np_dim_scatter_add_like(input, dim, index, like): + output = np.zeros(like.shape) + for inputidx in range(0, input.size): + outcoord = np.unravel_index(inputidx, input.shape) + outcoord = [*outcoord] + outcoord[dim] = index[np.unravel_index(inputidx, index.shape)] + output_offset = np.ravel_multi_index(outcoord, like_shape) + output[np.unravel_index(output_offset, like_shape)] += input[ + np.unravel_index(inputidx, input.shape) + ] + return output + like = np.random.randint(0, 100, like_shape) + if is_float: + input = np.random.random(input_shape) + else: + input = np.random.randint(0, 100, input_shape) + + def _np_dim_gather(dim, input, index): + output = np.zeros(index.shape) + for idx in range(0, index.size): + incoord = np.unravel_index(idx, index.shape) + outcoord=[*incoord] + incoord = [*incoord] + incoord[dim] = index[np.unravel_index(idx, index.shape)] + output[tuple(outcoord)] = input[tuple(incoord)] + return output + + index = np.random.randint(0, like_shape[dim], index_shape) + + output = _np_dim_scatter_add_like(input, dim, index, like) + grad = _np_dim_gather(dim, output, index) + return { + "input": input, + "index": index, + "like": like, + "dim": dim, + "output": output, + "grad": grad + } + +sample = gen_scatter_add_like_test_sample((2, 2), (2, 2), 0, (4,4)) +print(sample) From 02fa7ee9f3ed7b674928cea60e8cd60df9d995c3 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Mon, 28 Dec 2020 20:57:03 +0800 Subject: [PATCH 18/82] test case for dim_scatter_add_like --- .../python/test/ops/test_dim_scatter_add.py | 131 +++++++++++++++++- 1 file changed, 127 insertions(+), 4 deletions(-) diff --git a/oneflow/python/test/ops/test_dim_scatter_add.py b/oneflow/python/test/ops/test_dim_scatter_add.py index 61d3101049d..12d1b4b4a3e 100644 --- a/oneflow/python/test/ops/test_dim_scatter_add.py +++ b/oneflow/python/test/ops/test_dim_scatter_add.py @@ -36,11 +36,13 @@ def _np_dim_scatter_add_like(input, dim, index, like): np.unravel_index(inputidx, input.shape) ] return output - like = np.random.randint(0, 100, like_shape) + if is_float: input = np.random.random(input_shape) + like = np.random.random(like_shape) else: input = 
np.random.randint(0, 100, input_shape) + like = np.random.randint(0, 100, like_shape) def _np_dim_gather(dim, input, index): output = np.zeros(index.shape) @@ -55,7 +57,7 @@ def _np_dim_gather(dim, input, index): index = np.random.randint(0, like_shape[dim], index_shape) output = _np_dim_scatter_add_like(input, dim, index, like) - grad = _np_dim_gather(dim, output, index) + grad = _np_dim_gather(dim, np.ones(output.shape), index) return { "input": input, "index": index, @@ -65,5 +67,126 @@ def _np_dim_gather(dim, input, index): "grad": grad } -sample = gen_scatter_add_like_test_sample((2, 2), (2, 2), 0, (4,4)) -print(sample) +def _gen_arg_dict( + device_type="gpu", value_type="float", machine_ids="0:0", device_count=1 +): + arg_dict = OrderedDict() + arg_dict["device_type"] = [device_type] + arg_dict["samples"] = [] + arg_dict["samples"].append(gen_scatter_add_like_test_sample((2, 2), (2, 2), 1, (4, 4))) + if value_type == "float": + arg_dict["value_type"] = [ + (np.float32, flow.float32), + ] + elif value_type == "int": + arg_dict["value_type"] = [(np.float32, flow.int32)] + else: + raise "float or int for value type only" + + arg_dict["index_type"] = [(np.int32, flow.int32)] + arg_dict["machine_ids"] = [machine_ids] + arg_dict["device_count"] = [device_count] + return arg_dict + +def _make_dim_scatter_add_like_fn( + test_case, + input, + index, + dim, + like, + grad, + device_type, + value_type, + index_type, + machine_ids, + device_counts, +): + flow.clear_default_session() + if device_type == "cpu": + flow.config.cpu_device_num(device_counts) + else: + flow.config.gpu_device_num(device_counts) + + func_config = flow.FunctionConfig() + + # global function needs float32 as type of argument and return value + if value_type == flow.float16: + func_config.default_data_type(flow.float32) + else: + func_config.default_data_type(value_type) + + func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids)) + func_config.default_logical_view(flow.scope.consistent_view()) + + def _compare_diff(blob: oft.Numpy): + test_case.assertTrue(np.allclose(grad, blob)) + + if value_type == flow.float32 or value_type == flow.float64: + + @flow.global_function(type="train", function_config=func_config) + def scatter_add_like_fn( + params_def: oft.Numpy.Placeholder(input.shape, dtype=value_type), + indices_def: oft.Numpy.Placeholder(index.shape, dtype=index_type), + like_def: oft.Numpy.Placeholder(like.shape, dtype=value_type), + ) -> oft.Numpy: + with flow.scope.placement(device_type, "0:0"): + x_var = flow.get_variable( + "input", + shape=input.shape, + dtype=value_type, + initializer=flow.constant_initializer(0), + ) + x_var = flow.cast_to_current_logical_view(x_var) + x = x_var + params_def + + y = flow.dim_scatter_add_like(dim, indices_def, x, like_def) + + with flow.scope.placement(device_type, "0:0"): + flow.optimizer.SGD( + flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0 + ).minimize(y) + + flow.watch_diff(x, _compare_diff) + return y + + return scatter_add_like_fn + +def _compare_dim_scatter_add_like_with_samples( + test_case, device_type, sample, value_type, index_type, machine_ids, device_count +): + scatter_add_like_fn = _make_dim_scatter_add_like_fn( + test_case, + sample["input"].astype(value_type[0]), + sample["index"].astype(index_type[0]), + sample["dim"], + sample["like"], + sample["grad"].astype(value_type[0]), + device_type, + value_type[1], + index_type[1], + machine_ids, + device_count, + ) + y = scatter_add_like_fn( + 
sample["input"].astype(value_type[0]), + sample["index"].astype(index_type[0]), + sample["like"].astype(value_type[0]), + ) + y.astype(value_type[0]) + + if value_type == flow.float16: + test_case.assertTrue( + np.allclose(y, sample["output"].astype(np.float32), 1e-3, 1e-3) + ) + else: + test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0]))) + +@flow.unittest.skip_unless_1n1d() +class TestDimGather1n1d(flow.unittest.TestCase): + def test_dim_scatter_add_like_float_cpu(test_case): + arg_dict = _gen_arg_dict("cpu", "float", "0:0", 1) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_add_like_with_samples(test_case, *arg) + +if __name__ == "__main__": + unittest.main() From 29b6774b64a354e4edf2254317a4f4752ab68772 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Mon, 28 Dec 2020 22:08:07 +0800 Subject: [PATCH 19/82] 1n2d test case for dim_scatter_add_like --- .../python/test/ops/test_dim_scatter_add.py | 68 +++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/oneflow/python/test/ops/test_dim_scatter_add.py b/oneflow/python/test/ops/test_dim_scatter_add.py index 12d1b4b4a3e..e027d07ef86 100644 --- a/oneflow/python/test/ops/test_dim_scatter_add.py +++ b/oneflow/python/test/ops/test_dim_scatter_add.py @@ -22,6 +22,8 @@ import os +flow.config.enable_debug_mode(True) + def gen_scatter_add_like_test_sample( input_shape, index_shape, dim, like_shape, is_float=True ): @@ -73,7 +75,9 @@ def _gen_arg_dict( arg_dict = OrderedDict() arg_dict["device_type"] = [device_type] arg_dict["samples"] = [] - arg_dict["samples"].append(gen_scatter_add_like_test_sample((2, 2), (2, 2), 1, (4, 4))) + arg_dict["samples"].append(gen_scatter_add_like_test_sample((2, 2), (2, 2), 1, (4, 4), value_type=="float")) + #arg_dict["samples"].append(gen_scatter_add_like_test_sample((2, 2), (2, 2), 0, (4, 4), value_type=="float")) + #arg_dict["samples"].append(gen_scatter_add_like_test_sample((4, 3, 3), (4, 3, 3), 0, (5, 5, 5), value_type=="float")) if value_type == "float": arg_dict["value_type"] = [ (np.float32, flow.float32), @@ -151,6 +155,37 @@ def scatter_add_like_fn( return scatter_add_like_fn + if value_type == flow.int32: + @flow.global_function(type="train", function_config=func_config) + def scatter_add_like_fn( + params_def: oft.Numpy.Placeholder(input.shape, dtype=flow.float32), + indices_def: oft.Numpy.Placeholder(index.shape, dtype=index_type), + like_def: oft.Numpy.Placeholder(like.shape, dtype=flow.float32), + ) -> oft.Numpy: + with flow.scope.placement(device_type, "0:0"): + x_var = flow.get_variable( + "input", + shape=params_def.shape, + dtype=flow.float32, + initializer=flow.constant_initializer(0), + ) + x_var = flow.cast_to_current_logical_view(x_var) + x = x_var + params_def + + x_int32 = flow.cast(x, dtype=flow.int32) + y_int32 = flow.dim_scatter_add_like(dim, indices_def, x_int32, like_def) + y_fp32 = flow.cast(y_int32, dtype=flow.int32) + + with flow.scope.placement(device_type, "0:0"): + flow.optimizer.SGD( + flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0 + ).minimize(y_fp32) + + flow.watch_diff(x, _compare_diff) + return y_fp32 + + return scatter_add_like_fn + def _compare_dim_scatter_add_like_with_samples( test_case, device_type, sample, value_type, index_type, machine_ids, device_count ): @@ -159,7 +194,7 @@ def _compare_dim_scatter_add_like_with_samples( sample["input"].astype(value_type[0]), sample["index"].astype(index_type[0]), sample["dim"], - sample["like"], + sample["like"].astype(value_type[0]), 
sample["grad"].astype(value_type[0]), device_type, value_type[1], @@ -179,14 +214,39 @@ def _compare_dim_scatter_add_like_with_samples( np.allclose(y, sample["output"].astype(np.float32), 1e-3, 1e-3) ) else: - test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0]))) + test_case.assertTrue(np.allclose(y.astype(value_type[0]), sample["output"].astype(value_type[0]))) @flow.unittest.skip_unless_1n1d() -class TestDimGather1n1d(flow.unittest.TestCase): +class TestDimScatterAddLike1n1d(flow.unittest.TestCase): + def test_dim_scatter_add_like_int_cpu(test_case): + arg_dict = _gen_arg_dict("cpu", "int", "0:0", 1) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_add_like_with_samples(test_case, *arg) + def test_dim_scatter_add_like_float_cpu(test_case): arg_dict = _gen_arg_dict("cpu", "float", "0:0", 1) for arg in GenArgList(arg_dict): _compare_dim_scatter_add_like_with_samples(test_case, *arg) + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_add_like_int_gpu(test_case): + arg_dict = _gen_arg_dict("gpu", "int", "0:0", 1) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_add_like_with_samples(test_case, *arg) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_add_like_float_gpu(test_case): + arg_dict = _gen_arg_dict("gpu", "float", "0:0", 1) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_add_like_with_samples(test_case, *arg) + +@flow.unittest.skip_unless_1n2d() +class TestDimScatterAddLike1n2d(flow.unittest.TestCase): + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_add_like_float(test_case): + arg_dict = _gen_arg_dict("gpu", "float", "0:0-1", 2) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_add_like_with_samples(test_case, *arg) + if __name__ == "__main__": unittest.main() From 57dca96ca088c63d80808d90b1f29bbf76030d6b Mon Sep 17 00:00:00 2001 From: doombeaker Date: Mon, 28 Dec 2020 22:50:17 +0800 Subject: [PATCH 20/82] refine scatter sbp --- oneflow/user/ops/dim_scatter_ops.cpp | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index ea618590062..6b48cdcdf96 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -98,7 +98,8 @@ Maybe InferBatchAxis(user_op::BatchAxisContext* ctx) { return Maybe::Ok(); } -Maybe SetSbp(user_op::SbpContext* ctx) { +void _SetSbp(user_op::SbpContext* ctx, const char* like_or_src) +{ const user_op::TensorDesc& index_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); int64_t index_num_axes = index_tensor.shape().NumAxes(); const int32_t dim = ctx->Attr("dim"); @@ -109,8 +110,7 @@ Maybe SetSbp(user_op::SbpContext* ctx) { .Split(user_op::OpArg("index", 0), i) .Split(user_op::OpArg("input", 0), i) .Split(user_op::OpArg("output", 0), i) - .Split(user_op::OpArg("like", 0), i) - .Split(user_op::OpArg("src", 0), i) + .Split(user_op::OpArg(like_or_src, 0), i) .Build(); } } @@ -119,16 +119,24 @@ Maybe SetSbp(user_op::SbpContext* ctx) { .PartialSum(user_op::OpArg("input", 0)) .Broadcast(user_op::OpArg("index", 0)) .PartialSum(user_op::OpArg("output", 0)) - .PartialSum(user_op::OpArg("like", 0)) - .PartialSum(user_op::OpArg("src", 0)) - .Build(); + .PartialSum(user_op::OpArg(like_or_src, 0)) + .Build(); +} + +Maybe SetSbpLike(user_op::SbpContext* ctx) { + _SetSbp(ctx, "like"); + return Maybe::Ok(); 
+} + +Maybe SetSbpInplace(user_op::SbpContext* ctx) { + _SetSbp(ctx, "src"); return Maybe::Ok(); } } // namespace #define REGISTER_SCATTER_LIKE_OP(optypename) \ REGISTER_USER_OP(optypename) \ - .OptionalInput("like") \ + .Input("like") \ .Input("input") \ .Input("index") \ .Output("output") \ @@ -136,7 +144,7 @@ Maybe SetSbp(user_op::SbpContext* ctx) { .SetTensorDescInferFn(InferTensorDesc) \ .SetInputArgModifyFn(InputArgModifierFn) \ .SetBatchAxisInferFn(InferBatchAxis) \ - .SetGetSbpFn(SetSbp) + .SetGetSbpFn(SetSbpLike) #define REGISTER_SCATTER_INPLACE_OP(optypename) \ REGISTER_USER_OP(optypename) \ @@ -148,7 +156,7 @@ Maybe SetSbp(user_op::SbpContext* ctx) { .SetTensorDescInferFn(InferTensorDesc) \ .SetInputArgModifyFn(InplaceInputArgModifierFn) \ .SetBatchAxisInferFn(InferBatchAxis) \ - .SetGetSbpFn(SetSbp) + .SetGetSbpFn(SetSbpInplace) #define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ REGISTER_USER_OP_GRAD(optypename) \ From 005c97ee68ef44f216bf762f0d22941dbd02b94f Mon Sep 17 00:00:00 2001 From: doombeaker Date: Mon, 28 Dec 2020 22:50:42 +0800 Subject: [PATCH 21/82] fail to sccater_add_like on 1n2d --- oneflow/python/test/ops/test_dim_scatter_add.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/oneflow/python/test/ops/test_dim_scatter_add.py b/oneflow/python/test/ops/test_dim_scatter_add.py index e027d07ef86..e81cb3602a0 100644 --- a/oneflow/python/test/ops/test_dim_scatter_add.py +++ b/oneflow/python/test/ops/test_dim_scatter_add.py @@ -214,7 +214,10 @@ def _compare_dim_scatter_add_like_with_samples( np.allclose(y, sample["output"].astype(np.float32), 1e-3, 1e-3) ) else: - test_case.assertTrue(np.allclose(y.astype(value_type[0]), sample["output"].astype(value_type[0]))) + print("oneflow:", y) + print("numpy:", sample["output"]) + print((y - sample["output"])<1e-3) + test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0]))) @flow.unittest.skip_unless_1n1d() class TestDimScatterAddLike1n1d(flow.unittest.TestCase): @@ -244,7 +247,7 @@ def test_dim_scatter_add_like_float_gpu(test_case): class TestDimScatterAddLike1n2d(flow.unittest.TestCase): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_float(test_case): - arg_dict = _gen_arg_dict("gpu", "float", "0:0-1", 2) + arg_dict = _gen_arg_dict("cpu", "float", "0:0-1", 2) for arg in GenArgList(arg_dict): _compare_dim_scatter_add_like_with_samples(test_case, *arg) From bbb99bcbcdc68b105c4c0e5ec86f494538eb46c1 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Tue, 29 Dec 2020 10:29:17 +0800 Subject: [PATCH 22/82] refing sbp --- oneflow/user/ops/dim_scatter_ops.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 6b48cdcdf96..4558735c8ab 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -100,21 +100,6 @@ Maybe InferBatchAxis(user_op::BatchAxisContext* ctx) { void _SetSbp(user_op::SbpContext* ctx, const char* like_or_src) { - const user_op::TensorDesc& index_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); - int64_t index_num_axes = index_tensor.shape().NumAxes(); - const int32_t dim = ctx->Attr("dim"); - - FOR_RANGE(int64_t, i, 0, index_num_axes) { - if (i != dim) { - ctx->NewBuilder() - .Split(user_op::OpArg("index", 0), i) - .Split(user_op::OpArg("input", 0), i) - .Split(user_op::OpArg("output", 0), i) - .Split(user_op::OpArg(like_or_src, 0), i) - .Build(); - } - } - ctx->NewBuilder() 
.PartialSum(user_op::OpArg("input", 0)) .Broadcast(user_op::OpArg("index", 0)) From 982bfb8c8cc0067f8ee8f4e3235674c85ceb97e4 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Tue, 29 Dec 2020 10:32:55 +0800 Subject: [PATCH 23/82] refine test case, unify add and update like ops --- .../{test_dim_scatter_add.py => test_dim_scatter_op_like.py} | 3 --- 1 file changed, 3 deletions(-) rename oneflow/python/test/ops/{test_dim_scatter_add.py => test_dim_scatter_op_like.py} (98%) diff --git a/oneflow/python/test/ops/test_dim_scatter_add.py b/oneflow/python/test/ops/test_dim_scatter_op_like.py similarity index 98% rename from oneflow/python/test/ops/test_dim_scatter_add.py rename to oneflow/python/test/ops/test_dim_scatter_op_like.py index e81cb3602a0..5fda5c8a798 100644 --- a/oneflow/python/test/ops/test_dim_scatter_add.py +++ b/oneflow/python/test/ops/test_dim_scatter_op_like.py @@ -214,9 +214,6 @@ def _compare_dim_scatter_add_like_with_samples( np.allclose(y, sample["output"].astype(np.float32), 1e-3, 1e-3) ) else: - print("oneflow:", y) - print("numpy:", sample["output"]) - print((y - sample["output"])<1e-3) test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0]))) @flow.unittest.skip_unless_1n1d() From f712efb19711bff3ef09a9eb48207d7b8a65ec84 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Tue, 29 Dec 2020 11:17:29 +0800 Subject: [PATCH 24/82] test case for scatter_add/update like ops finished --- .../test/ops/test_dim_scatter_op_like.py | 136 ++++++++++++++---- oneflow/user/ops/dim_scatter_ops.cpp | 7 +- 2 files changed, 112 insertions(+), 31 deletions(-) diff --git a/oneflow/python/test/ops/test_dim_scatter_op_like.py b/oneflow/python/test/ops/test_dim_scatter_op_like.py index 5fda5c8a798..0c85164ae9e 100644 --- a/oneflow/python/test/ops/test_dim_scatter_op_like.py +++ b/oneflow/python/test/ops/test_dim_scatter_op_like.py @@ -24,8 +24,17 @@ flow.config.enable_debug_mode(True) -def gen_scatter_add_like_test_sample( - input_shape, index_shape, dim, like_shape, is_float=True + +def _bin_add(out_val, in_value): + return out_val + in_value + + +def _bin_update(out_val, in_value): + return in_value + + +def gen_scatter_like_test_sample( + input_shape, index_shape, dim, like_shape, is_float=True, binop=_bin_add ): def _np_dim_scatter_add_like(input, dim, index, like): output = np.zeros(like.shape) @@ -34,11 +43,13 @@ def _np_dim_scatter_add_like(input, dim, index, like): outcoord = [*outcoord] outcoord[dim] = index[np.unravel_index(inputidx, index.shape)] output_offset = np.ravel_multi_index(outcoord, like_shape) - output[np.unravel_index(output_offset, like_shape)] += input[ - np.unravel_index(inputidx, input.shape) - ] + output[np.unravel_index(output_offset, like_shape)] = binop( + output[np.unravel_index(output_offset, like_shape)], + input[np.unravel_index(inputidx, input.shape)], + ) + return output - + if is_float: input = np.random.random(input_shape) like = np.random.random(like_shape) @@ -50,7 +61,7 @@ def _np_dim_gather(dim, input, index): output = np.zeros(index.shape) for idx in range(0, index.size): incoord = np.unravel_index(idx, index.shape) - outcoord=[*incoord] + outcoord = [*incoord] incoord = [*incoord] incoord[dim] = index[np.unravel_index(idx, index.shape)] output[tuple(outcoord)] = input[tuple(incoord)] @@ -66,18 +77,28 @@ def _np_dim_gather(dim, input, index): "like": like, "dim": dim, "output": output, - "grad": grad + "grad": grad, } + def _gen_arg_dict( - device_type="gpu", value_type="float", machine_ids="0:0", device_count=1 + device_type="gpu", + 
value_type="float", + machine_ids="0:0", + device_count=1, + binop=_bin_add, + dim_scatter_op=flow.dim_scatter_add_like, ): arg_dict = OrderedDict() arg_dict["device_type"] = [device_type] arg_dict["samples"] = [] - arg_dict["samples"].append(gen_scatter_add_like_test_sample((2, 2), (2, 2), 1, (4, 4), value_type=="float")) - #arg_dict["samples"].append(gen_scatter_add_like_test_sample((2, 2), (2, 2), 0, (4, 4), value_type=="float")) - #arg_dict["samples"].append(gen_scatter_add_like_test_sample((4, 3, 3), (4, 3, 3), 0, (5, 5, 5), value_type=="float")) + arg_dict["samples"].append( + gen_scatter_like_test_sample( + (2, 2), (2, 2), 1, (4, 4), is_float=value_type == "float", binop=binop + ) + ) + # arg_dict["samples"].append(gen_scatter_like_test_sample((2, 2), (2, 2), 0, (4, 4), value_type=="float")) + # arg_dict["samples"].append(gen_scatter_like_test_sample((4, 3, 3), (4, 3, 3), 0, (5, 5, 5), value_type=="float")) if value_type == "float": arg_dict["value_type"] = [ (np.float32, flow.float32), @@ -90,8 +111,10 @@ def _gen_arg_dict( arg_dict["index_type"] = [(np.int32, flow.int32)] arg_dict["machine_ids"] = [machine_ids] arg_dict["device_count"] = [device_count] + arg_dict["flow_scatter_op"] = [dim_scatter_op] return arg_dict + def _make_dim_scatter_add_like_fn( test_case, input, @@ -104,6 +127,7 @@ def _make_dim_scatter_add_like_fn( index_type, machine_ids, device_counts, + flow_scatter_op, ): flow.clear_default_session() if device_type == "cpu": @@ -143,7 +167,7 @@ def scatter_add_like_fn( x_var = flow.cast_to_current_logical_view(x_var) x = x_var + params_def - y = flow.dim_scatter_add_like(dim, indices_def, x, like_def) + y = flow_scatter_op(dim, indices_def, x, like_def) with flow.scope.placement(device_type, "0:0"): flow.optimizer.SGD( @@ -156,6 +180,7 @@ def scatter_add_like_fn( return scatter_add_like_fn if value_type == flow.int32: + @flow.global_function(type="train", function_config=func_config) def scatter_add_like_fn( params_def: oft.Numpy.Placeholder(input.shape, dtype=flow.float32), @@ -173,7 +198,7 @@ def scatter_add_like_fn( x = x_var + params_def x_int32 = flow.cast(x, dtype=flow.int32) - y_int32 = flow.dim_scatter_add_like(dim, indices_def, x_int32, like_def) + y_int32 = flow_scatter_op(dim, indices_def, x_int32, like_def) y_fp32 = flow.cast(y_int32, dtype=flow.int32) with flow.scope.placement(device_type, "0:0"): @@ -186,8 +211,16 @@ def scatter_add_like_fn( return scatter_add_like_fn -def _compare_dim_scatter_add_like_with_samples( - test_case, device_type, sample, value_type, index_type, machine_ids, device_count + +def _compare_dim_scatter_op_like_with_samples( + test_case, + device_type, + sample, + value_type, + index_type, + machine_ids, + device_count, + flow_scatter_op, ): scatter_add_like_fn = _make_dim_scatter_add_like_fn( test_case, @@ -201,9 +234,10 @@ def _compare_dim_scatter_add_like_with_samples( index_type[1], machine_ids, device_count, + flow_scatter_op, ) y = scatter_add_like_fn( - sample["input"].astype(value_type[0]), + sample["input"].astype(value_type[0]), sample["index"].astype(index_type[0]), sample["like"].astype(value_type[0]), ) @@ -216,37 +250,85 @@ def _compare_dim_scatter_add_like_with_samples( else: test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0]))) + @flow.unittest.skip_unless_1n1d() class TestDimScatterAddLike1n1d(flow.unittest.TestCase): def test_dim_scatter_add_like_int_cpu(test_case): - arg_dict = _gen_arg_dict("cpu", "int", "0:0", 1) + arg_dict = _gen_arg_dict( + "cpu", "int", "0:0", 1, _bin_add, 
flow.dim_scatter_add_like + ) for arg in GenArgList(arg_dict): - _compare_dim_scatter_add_like_with_samples(test_case, *arg) + _compare_dim_scatter_op_like_with_samples(test_case, *arg) def test_dim_scatter_add_like_float_cpu(test_case): - arg_dict = _gen_arg_dict("cpu", "float", "0:0", 1) + arg_dict = _gen_arg_dict("cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like) for arg in GenArgList(arg_dict): - _compare_dim_scatter_add_like_with_samples(test_case, *arg) + _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_int_gpu(test_case): - arg_dict = _gen_arg_dict("gpu", "int", "0:0", 1) + arg_dict = _gen_arg_dict("gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like) for arg in GenArgList(arg_dict): - _compare_dim_scatter_add_like_with_samples(test_case, *arg) + _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_float_gpu(test_case): - arg_dict = _gen_arg_dict("gpu", "float", "0:0", 1) + arg_dict = _gen_arg_dict("gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + +@flow.unittest.skip_unless_1n1d() +class TestDimScatterUpdateLike1n1d(flow.unittest.TestCase): + def test_dim_scatter_update_like_int_cpu(test_case): + arg_dict = _gen_arg_dict( + "cpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update_like + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + def test_dim_scatter_update_like_float_cpu(test_case): + arg_dict = _gen_arg_dict( + "cpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update_like + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_update_like_int_gpu(test_case): + arg_dict = _gen_arg_dict( + "gpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update_like + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_update_like_float_gpu(test_case): + arg_dict = _gen_arg_dict( + "gpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update_like + ) for arg in GenArgList(arg_dict): - _compare_dim_scatter_add_like_with_samples(test_case, *arg) + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + @flow.unittest.skip_unless_1n2d() class TestDimScatterAddLike1n2d(flow.unittest.TestCase): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_float(test_case): - arg_dict = _gen_arg_dict("cpu", "float", "0:0-1", 2) + arg_dict = _gen_arg_dict("gpu", "float", "0:0-1", 2, _bin_add, flow.dim_scatter_add_like) for arg in GenArgList(arg_dict): - _compare_dim_scatter_add_like_with_samples(test_case, *arg) + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + +@flow.unittest.skip_unless_1n2d() +class TestDimScatterUpdateLike1n2d(flow.unittest.TestCase): + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_update_like_float(test_case): + arg_dict = _gen_arg_dict( + "gpu", "float", "0:0-1", 2, _bin_update, flow.dim_scatter_update_like + ) + for arg in GenArgList(arg_dict): + 
_compare_dim_scatter_op_like_with_samples(test_case, *arg) + if __name__ == "__main__": unittest.main() diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 4558735c8ab..796683dc0e4 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -98,14 +98,13 @@ Maybe InferBatchAxis(user_op::BatchAxisContext* ctx) { return Maybe::Ok(); } -void _SetSbp(user_op::SbpContext* ctx, const char* like_or_src) -{ +void _SetSbp(user_op::SbpContext* ctx, const char* like_or_src) { ctx->NewBuilder() .PartialSum(user_op::OpArg("input", 0)) .Broadcast(user_op::OpArg("index", 0)) .PartialSum(user_op::OpArg("output", 0)) .PartialSum(user_op::OpArg(like_or_src, 0)) - .Build(); + .Build(); } Maybe SetSbpLike(user_op::SbpContext* ctx) { @@ -121,7 +120,7 @@ Maybe SetSbpInplace(user_op::SbpContext* ctx) { #define REGISTER_SCATTER_LIKE_OP(optypename) \ REGISTER_USER_OP(optypename) \ - .Input("like") \ + .Input("like") \ .Input("input") \ .Input("index") \ .Output("output") \ From 0c64679ee71168bd1a0322ad7966eb61cbbbd68f Mon Sep 17 00:00:00 2001 From: doombeaker Date: Tue, 29 Dec 2020 14:15:49 +0800 Subject: [PATCH 25/82] test cases for scatter ops --- ...ter_op_like.py => test_dim_scatter_ops.py} | 185 ++++++++++++++++-- 1 file changed, 170 insertions(+), 15 deletions(-) rename oneflow/python/test/ops/{test_dim_scatter_op_like.py => test_dim_scatter_ops.py} (66%) diff --git a/oneflow/python/test/ops/test_dim_scatter_op_like.py b/oneflow/python/test/ops/test_dim_scatter_ops.py similarity index 66% rename from oneflow/python/test/ops/test_dim_scatter_op_like.py rename to oneflow/python/test/ops/test_dim_scatter_ops.py index 0c85164ae9e..59fc9522810 100644 --- a/oneflow/python/test/ops/test_dim_scatter_op_like.py +++ b/oneflow/python/test/ops/test_dim_scatter_ops.py @@ -34,10 +34,20 @@ def _bin_update(out_val, in_value): def gen_scatter_like_test_sample( - input_shape, index_shape, dim, like_shape, is_float=True, binop=_bin_add + input_shape, + index_shape, + dim, + like_shape, + is_float=True, + binop=_bin_add, + inplace=True, ): def _np_dim_scatter_add_like(input, dim, index, like): - output = np.zeros(like.shape) + if inplace: + output = like.copy() + else: + output = np.zeros(like.shape) + for inputidx in range(0, input.size): outcoord = np.unravel_index(inputidx, input.shape) outcoord = [*outcoord] @@ -88,13 +98,20 @@ def _gen_arg_dict( device_count=1, binop=_bin_add, dim_scatter_op=flow.dim_scatter_add_like, + inplace=True, ): arg_dict = OrderedDict() arg_dict["device_type"] = [device_type] arg_dict["samples"] = [] arg_dict["samples"].append( gen_scatter_like_test_sample( - (2, 2), (2, 2), 1, (4, 4), is_float=value_type == "float", binop=binop + (2, 2), + (2, 2), + 1, + (2, 2), + is_float=value_type == "float", + binop=binop, + inplace=inplace, ) ) # arg_dict["samples"].append(gen_scatter_like_test_sample((2, 2), (2, 2), 0, (4, 4), value_type=="float")) @@ -198,7 +215,8 @@ def scatter_add_like_fn( x = x_var + params_def x_int32 = flow.cast(x, dtype=flow.int32) - y_int32 = flow_scatter_op(dim, indices_def, x_int32, like_def) + like_def_int32 = flow.cast(like_def, dtype=flow.int32) + y_int32 = flow_scatter_op(dim, indices_def, x_int32, like_def_int32) y_fp32 = flow.cast(y_int32, dtype=flow.int32) with flow.scope.placement(device_type, "0:0"): @@ -251,44 +269,173 @@ def _compare_dim_scatter_op_like_with_samples( test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0]))) +# ----scatter like ops test---- 
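# A minimal numpy-only sketch (illustrative only -- the helper and variable names here
# are not part of the patched test module or the OneFlow API) of the reference
# semantics exercised by the test classes below: each element of the source keeps its
# coordinates except along `dim`, where the destination coordinate is read from
# `index`. The "*_like" cases build the expected result in a zero base, the in-place
# variants start from a copy of the base blob (the `inplace=True` path of
# `gen_scatter_like_test_sample` above), and "add" accumulates where "update" overwrites.
import numpy as np

def np_dim_scatter(base, dim, index, src, accumulate):
    out = base.copy()
    for flat in range(src.size):
        src_coord = np.unravel_index(flat, src.shape)
        out_coord = list(src_coord)
        out_coord[dim] = index[src_coord]  # redirect only the `dim` coordinate
        if accumulate:
            out[tuple(out_coord)] += src[src_coord]
        else:
            out[tuple(out_coord)] = src[src_coord]
    return out

_src = np.array([[1.0, 2.0], [3.0, 4.0]])
_idx = np.array([[0, 0], [1, 1]])  # colliding targets along dim=1
# update keeps the last write:      [[2. 0.] [0. 4.]]
print(np_dim_scatter(np.zeros((2, 2)), 1, _idx, _src, accumulate=False))
# add accumulates colliding writes: [[3. 0.] [0. 7.]]
print(np_dim_scatter(np.zeros((2, 2)), 1, _idx, _src, accumulate=True))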
@flow.unittest.skip_unless_1n1d() -class TestDimScatterAddLike1n1d(flow.unittest.TestCase): +class TestDimScatterOpsLike1n1d(flow.unittest.TestCase): def test_dim_scatter_add_like_int_cpu(test_case): arg_dict = _gen_arg_dict( - "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like + "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) def test_dim_scatter_add_like_float_cpu(test_case): - arg_dict = _gen_arg_dict("cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like) + arg_dict = _gen_arg_dict( + "cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_int_gpu(test_case): - arg_dict = _gen_arg_dict("gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like) + arg_dict = _gen_arg_dict( + "gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_float_gpu(test_case): - arg_dict = _gen_arg_dict("gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like) + arg_dict = _gen_arg_dict( + "gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + def test_dim_scatter_update_like_int_cpu(test_case): + arg_dict = _gen_arg_dict( + "cpu", + "int", + "0:0", + 1, + _bin_update, + flow.dim_scatter_update_like, + inplace=False, + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + def test_dim_scatter_update_like_float_cpu(test_case): + arg_dict = _gen_arg_dict( + "cpu", + "float", + "0:0", + 1, + _bin_update, + flow.dim_scatter_update_like, + inplace=False, + ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_update_like_int_gpu(test_case): + arg_dict = _gen_arg_dict( + "gpu", + "int", + "0:0", + 1, + _bin_update, + flow.dim_scatter_update_like, + inplace=False, + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_update_like_float_gpu(test_case): + arg_dict = _gen_arg_dict( + "gpu", + "float", + "0:0", + 1, + _bin_update, + flow.dim_scatter_update_like, + inplace=False, + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + +@flow.unittest.skip_unless_1n2d() +class TestDimScatterOpsLike1n2d(flow.unittest.TestCase): + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_add_like_float(test_case): + arg_dict = _gen_arg_dict( + "gpu", + "float", + "0:0-1", + 2, + _bin_add, + flow.dim_scatter_add_like, + inplace=False, + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_update_like_float(test_case): + arg_dict = _gen_arg_dict( + "gpu", + "float", 
+ "0:0-1", + 2, + _bin_update, + flow.dim_scatter_update_like, + inplace=False, + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + +# ----scatter like ops test END---- + +# ----scatter inplace ops test---- +@flow.unittest.skip_unless_1n1d() +class TestDimScatterAddInplace1n1d(flow.unittest.TestCase): + def test_dim_scatter_add_int_cpu(test_case): + arg_dict = _gen_arg_dict( + "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + def test_dim_scatter_add_float_cpu(test_case): + arg_dict = _gen_arg_dict( + "cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_add_like_int_gpu(test_case): + arg_dict = _gen_arg_dict( + "gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_add_like_float_gpu(test_case): + arg_dict = _gen_arg_dict( + "gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + @flow.unittest.skip_unless_1n1d() class TestDimScatterUpdateLike1n1d(flow.unittest.TestCase): def test_dim_scatter_update_like_int_cpu(test_case): arg_dict = _gen_arg_dict( - "cpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update_like + "cpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) def test_dim_scatter_update_like_float_cpu(test_case): arg_dict = _gen_arg_dict( - "cpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update_like + "cpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @@ -296,7 +443,7 @@ def test_dim_scatter_update_like_float_cpu(test_case): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_update_like_int_gpu(test_case): arg_dict = _gen_arg_dict( - "gpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update_like + "gpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @@ -304,7 +451,7 @@ def test_dim_scatter_update_like_int_gpu(test_case): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_update_like_float_gpu(test_case): arg_dict = _gen_arg_dict( - "gpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update_like + "gpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @@ -314,7 +461,9 @@ def test_dim_scatter_update_like_float_gpu(test_case): class TestDimScatterAddLike1n2d(flow.unittest.TestCase): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_float(test_case): - arg_dict = _gen_arg_dict("gpu", "float", "0:0-1", 2, _bin_add, flow.dim_scatter_add_like) + 
arg_dict = _gen_arg_dict( + "gpu", "float", "0:0-1", 2, _bin_add, flow.dim_scatter_add, inplace=True + ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @@ -324,7 +473,13 @@ class TestDimScatterUpdateLike1n2d(flow.unittest.TestCase): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_update_like_float(test_case): arg_dict = _gen_arg_dict( - "gpu", "float", "0:0-1", 2, _bin_update, flow.dim_scatter_update_like + "gpu", + "float", + "0:0-1", + 2, + _bin_update, + flow.dim_scatter_update, + inplace=True, ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) From d4da91b4d0a130bc39c6828a69f3ce27ea8f597e Mon Sep 17 00:00:00 2001 From: doombeaker Date: Tue, 29 Dec 2020 14:28:14 +0800 Subject: [PATCH 26/82] refine, merge test class --- oneflow/python/test/ops/test_dim_scatter_ops.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/oneflow/python/test/ops/test_dim_scatter_ops.py b/oneflow/python/test/ops/test_dim_scatter_ops.py index 59fc9522810..4c34e5bb9fb 100644 --- a/oneflow/python/test/ops/test_dim_scatter_ops.py +++ b/oneflow/python/test/ops/test_dim_scatter_ops.py @@ -269,7 +269,6 @@ def _compare_dim_scatter_op_like_with_samples( test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0]))) -# ----scatter like ops test---- @flow.unittest.skip_unless_1n1d() class TestDimScatterOpsLike1n1d(flow.unittest.TestCase): def test_dim_scatter_add_like_int_cpu(test_case): @@ -388,11 +387,8 @@ def test_dim_scatter_update_like_float(test_case): _compare_dim_scatter_op_like_with_samples(test_case, *arg) -# ----scatter like ops test END---- - -# ----scatter inplace ops test---- @flow.unittest.skip_unless_1n1d() -class TestDimScatterAddInplace1n1d(flow.unittest.TestCase): +class TestDimScatterOpsInplace1n1d(flow.unittest.TestCase): def test_dim_scatter_add_int_cpu(test_case): arg_dict = _gen_arg_dict( "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True @@ -423,9 +419,6 @@ def test_dim_scatter_add_like_float_gpu(test_case): for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) - -@flow.unittest.skip_unless_1n1d() -class TestDimScatterUpdateLike1n1d(flow.unittest.TestCase): def test_dim_scatter_update_like_int_cpu(test_case): arg_dict = _gen_arg_dict( "cpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True @@ -458,7 +451,7 @@ def test_dim_scatter_update_like_float_gpu(test_case): @flow.unittest.skip_unless_1n2d() -class TestDimScatterAddLike1n2d(flow.unittest.TestCase): +class TestDimScatterOpsInplace1n2d(flow.unittest.TestCase): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_float(test_case): arg_dict = _gen_arg_dict( @@ -467,9 +460,6 @@ def test_dim_scatter_add_like_float(test_case): for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) - -@flow.unittest.skip_unless_1n2d() -class TestDimScatterUpdateLike1n2d(flow.unittest.TestCase): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_update_like_float(test_case): arg_dict = _gen_arg_dict( From 56e309b944bbc15013985e5558fbee60040a8d80 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Tue, 29 Dec 2020 15:15:23 +0800 Subject: [PATCH 27/82] startup of api docs --- oneflow/python/ops/array_ops.py | 40 ++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 
deletion(-) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 26b0f1ef712..10ec7bb3d1d 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2322,6 +2322,44 @@ def dim_scatter_update_like( like: remote_blob_util.BlobDef, name: Optional[str] = None, ) -> remote_blob_util.BlobDef: + r""" desc: ... `input` according to `index` along with the axis `dim`. + + Take a 3-D blob as example, the output is specified by: + + .. code-block:: python + + output[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0 + output[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 + output[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 + + + The shape of `input` and `index` should be the same except in the `dim` dimension. + + That is, if `input` is a n-dimension blob with shape :math:`(x_0, x_1, \dots, x_{i-1}, x_i, x_{i+1}, \dots, x_n)`, + and `dim = i`, then `index` must be a n-dimension blob with shape :math:`(x_0, x_1, \dots, x_{i-1}, k, x_{i+1}, \dots, x_n)` + where :math:`k \geq 1`. + + The return Blob `output` will have the same shape with `index`. + + Args: + dim (int): The axis along which to index + index (remote_blob_util.BlobDef): The index blob of elements to scatter + input (remote_blob_util.BlobDef): The input blob whose elments will be scatterd and updated to output. + like (remote_blob_util.BlobDef): The like blob. The shape size of output will be same as like blob. + name (Optional[str], optional): The name of the operation. Defaults to None. + + Returns: + remote_blob_util.BlobDef: The elements scattered from `input` will be returned as the output Blob. + + For example: + + .. code-block:: python + + import oneflow as flow + #... + + """ + return ( flow.user_op_builder( name if name is not None else id_util.UniqueStr("DimScatterUpdateLike_") @@ -2396,7 +2434,7 @@ def dim_scatter_add( ) -> remote_blob_util.BlobDef: return ( flow.user_op_builder( - name if name is not None else id_util.UniqueStr("DimScatterAddLike_") + name if name is not None else id_util.UniqueStr("DimScatterAdd_") ) .Op("dim_scatter_add") .Input("input", [input]) From 6401f4d356f7985f73d3720ca0df8a52c3a9c732 Mon Sep 17 00:00:00 2001 From: doombeaker Date: Wed, 30 Dec 2020 10:47:41 +0800 Subject: [PATCH 28/82] add scatter api docs and assertion in python --- oneflow/python/ops/array_ops.py | 82 +++++++++++++++---- .../python/test/ops/test_dim_scatter_ops.py | 4 +- 2 files changed, 66 insertions(+), 20 deletions(-) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 10ec7bb3d1d..1fc8cd09cb0 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2314,37 +2314,56 @@ def amp_white_identity( return op.InferAndTryRun().SoleOutputBlob() +def _check_scatter_blobs(input, dim, index, like_or_src): + assert dim < len(index.shape), ValueError( + "Value of dim is out of range(dim should be less than len(index.shape))" + ) + + assert len(input.shape) == len(index.shape) and len(input.shape) == len( + like_or_src.shape + ), ValueError("Number of dimensions of input, index and like/src should equal") + + for i in range(0, len(input.shape)): + assert input.shape[i] == index.shape[i], ValueError( + "Shape of input and index should be same" + ) + assert input.shape[i] <= like_or_src.shape[i], ValueError( + "Shape like/src blob should be larger than input" + ) + + @oneflow_export("dim_scatter_update_like") def dim_scatter_update_like( + input: remote_blob_util.BlobDef, dim: int, index: 
remote_blob_util.BlobDef, - input: remote_blob_util.BlobDef, like: remote_blob_util.BlobDef, name: Optional[str] = None, ) -> remote_blob_util.BlobDef: - r""" desc: ... `input` according to `index` along with the axis `dim`. + r"""This operator writes the elements specified by `index` along with the axis + `dim` from the `input` into the output blob whose shape size specified by `like`. Take a 3-D blob as example, the output is specified by: .. code-block:: python - output[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0 - output[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 - output[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 - + output[index[i][j][k]][j][k] = input[i][j][k] # if dim == 0 + output[i][index[i][j][k]][k] = input[i][j][k] # if dim == 1 + output[i][j][index[i][j][k]] = input[i][j][k] # if dim == 2 - The shape of `input` and `index` should be the same except in the `dim` dimension. + `input`, `index` and `like` should have same number of dimensions. + The shape of `input` and `index` should be the same. + It is also required that sizes at dimension `d` of `index` and `like` should be equal + for all dimensions d != dim. - That is, if `input` is a n-dimension blob with shape :math:`(x_0, x_1, \dots, x_{i-1}, x_i, x_{i+1}, \dots, x_n)`, - and `dim = i`, then `index` must be a n-dimension blob with shape :math:`(x_0, x_1, \dots, x_{i-1}, k, x_{i+1}, \dots, x_n)` - where :math:`k \geq 1`. - - The return Blob `output` will have the same shape with `index`. + Moreover, the values of index must not exceeds the range of like blob at dim dimension. + + The return Blob `output` will have the same shape with `like`. Args: + input (remote_blob_util.BlobDef): The input blob whose elments will be scatterd and updated to output. dim (int): The axis along which to index index (remote_blob_util.BlobDef): The index blob of elements to scatter - input (remote_blob_util.BlobDef): The input blob whose elments will be scatterd and updated to output. like (remote_blob_util.BlobDef): The like blob. The shape size of output will be same as like blob. name (Optional[str], optional): The name of the operation. Defaults to None. @@ -2356,9 +2375,36 @@ def dim_scatter_update_like( .. code-block:: python import oneflow as flow - #... + import numpy as np + import oneflow.typing as tp + + + @flow.global_function() + def dim_scatter_update_Job( + input: tp.Numpy.Placeholder((2, 2), dtype=flow.float64), + index: tp.Numpy.Placeholder((2, 2), dtype=flow.int32), + ) -> tp.Numpy: + like_blob = flow.get_variable( + "like_blob", + (3, 3), + dtype=flow.float64, + initializer=flow.constant_initializer(0), + ) + return flow.dim_scatter_update_like(input, 1, index, like_blob) + - """ + input = np.array([[1, 2], [3, 4]]).astype(np.float64) + index = np.array([[1, 0], [1, 2]]).astype(np.int32) + + out = dim_scatter_update_Job(input, index) + print(out) + # output + # [[2. 1. 0.] + # [0. 3. 4.] + # [0. 0. 
0.]] + + """ + _check_scatter_blobs(input, dim, index, like) return ( flow.user_op_builder( @@ -2378,9 +2424,9 @@ def dim_scatter_update_like( @oneflow_export("dim_scatter_update") def dim_scatter_update( + input: remote_blob_util.BlobDef, dim: int, index: remote_blob_util.BlobDef, - input: remote_blob_util.BlobDef, src: remote_blob_util.BlobDef, name: Optional[str] = None, ) -> remote_blob_util.BlobDef: @@ -2402,9 +2448,9 @@ def dim_scatter_update( @oneflow_export("dim_scatter_add_like") def dim_scatter_add_like( + input: remote_blob_util.BlobDef, dim: int, index: remote_blob_util.BlobDef, - input: remote_blob_util.BlobDef, like: remote_blob_util.BlobDef, name: Optional[str] = None, ) -> remote_blob_util.BlobDef: @@ -2426,9 +2472,9 @@ def dim_scatter_add_like( @oneflow_export("dim_scatter_add") def dim_scatter_add( + input: remote_blob_util.BlobDef, dim: int, index: remote_blob_util.BlobDef, - input: remote_blob_util.BlobDef, src: remote_blob_util.BlobDef, name: Optional[str] = None, ) -> remote_blob_util.BlobDef: diff --git a/oneflow/python/test/ops/test_dim_scatter_ops.py b/oneflow/python/test/ops/test_dim_scatter_ops.py index 4c34e5bb9fb..4facfcd9440 100644 --- a/oneflow/python/test/ops/test_dim_scatter_ops.py +++ b/oneflow/python/test/ops/test_dim_scatter_ops.py @@ -184,7 +184,7 @@ def scatter_add_like_fn( x_var = flow.cast_to_current_logical_view(x_var) x = x_var + params_def - y = flow_scatter_op(dim, indices_def, x, like_def) + y = flow_scatter_op(x, dim, indices_def, like_def) with flow.scope.placement(device_type, "0:0"): flow.optimizer.SGD( @@ -216,7 +216,7 @@ def scatter_add_like_fn( x_int32 = flow.cast(x, dtype=flow.int32) like_def_int32 = flow.cast(like_def, dtype=flow.int32) - y_int32 = flow_scatter_op(dim, indices_def, x_int32, like_def_int32) + y_int32 = flow_scatter_op(x_int32, dim, indices_def, like_def_int32) y_fp32 = flow.cast(y_int32, dtype=flow.int32) with flow.scope.placement(device_type, "0:0"): From 547adde061ca12e4ae7c665f13aa5b0f169915b4 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Fri, 2 Jul 2021 19:39:22 +0800 Subject: [PATCH 29/82] fix make error but still segment fault --- .../user/kernels/dim_gather_kernel_util.cu | 2 +- .../user/kernels/dim_gather_scatter_util.h | 8 +++++- .../user/kernels/dim_scatter_kernel_util.cu | 2 +- oneflow/user/ops/dim_gather_op.cpp | 25 +++++++++---------- oneflow/user/ops/dim_scatter_ops.cpp | 24 ++++++++++-------- 5 files changed, 34 insertions(+), 27 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cu b/oneflow/user/kernels/dim_gather_kernel_util.cu index 5b9e818ed99..e6d88cb69e1 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cu +++ b/oneflow/user/kernels/dim_gather_kernel_util.cu @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef WITH_CUDA -#include "oneflow/core/kernel/util/cuda_kernel_util.h" +// #include "oneflow/core/kernel/util/cuda_kernel_util.h" #include "oneflow/user/kernels/dim_gather_kernel_util.h" namespace oneflow { diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index 2f2395dda0e..c868acfc5b5 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -15,6 +15,11 @@ limitations under the License. 
*/ #ifndef ONEFLOW_USER_KERNELS_DIM_GAHTER_SCATTER__UTIL_H_ #define ONEFLOW_USER_KERNELS_DIM_GAHTER_SCATTER__UTIL_H_ + +#ifdef WITH_CUDA +#include "oneflow/core/cuda/atomic.cuh" +#endif // WITH_CUDA + #include "oneflow/core/ndarray/xpu_util.h" #include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/framework/framework.h" @@ -37,7 +42,8 @@ template struct DeviceBinOp { OF_DEVICE_FUNC static void Add(const T* x, T* y) { #ifdef __CUDA_ARCH__ - gpu_atomic_add(y, *x); // TODO:(YaoChi), refine add using float16 -> half -> float -> half + // gpu_atomic_add(y, *x); // TODO:(YaoChi), refine add using float16 -> half -> float -> half + cuda::atomic::Add(y, *x); #else *y += *x; #endif diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cu b/oneflow/user/kernels/dim_scatter_kernel_util.cu index f2df4fbb082..bb58e41dc60 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cu @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef WITH_CUDA -#include "oneflow/core/kernel/util/cuda_kernel_util.h" +// #include "oneflow/core/kernel/util/cuda_kernel_util.h" #include "oneflow/user/kernels/dim_scatter_kernel_util.h" namespace oneflow { diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index e2eebeb0f59..12cce2e3664 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp @@ -36,7 +36,7 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); // split_axs should NOT equals dim when in consistent view - const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); + const cfg::SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); auto is_split = in_sbp.has_split_parallel(); if (ctx->parallel_ctx().parallel_num() != 1 && is_split) { int64_t split_axis = in_sbp.split_parallel().axis(); @@ -66,17 +66,6 @@ void GatherInputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn indices_modifier->set_requires_grad(false); } -Maybe InferBatchAxis(user_op::BatchAxisContext* ctx) { - OptInt64* indices_batch_axis = ctx->BatchAxis4ArgNameAndIndex("index", 0); - if (indices_batch_axis->has_value()) { - CHECK_GE_OR_RETURN(indices_batch_axis->value(), 0); - CHECK_LE_OR_RETURN( - indices_batch_axis->value(), - ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0).shape().NumAxes() - 1); - } - *ctx->BatchAxis4ArgNameAndIndex("output", 0) = *indices_batch_axis; - return Maybe::Ok(); -} Maybe BuildSbp(user_op::SbpContext* ctx) { const user_op::TensorDesc& index_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); @@ -106,6 +95,16 @@ Maybe BuildSbp(user_op::SbpContext* ctx) { .Build(); return Maybe::Ok(); } + +Maybe InferDtype(user_op::InferContext* ctx) { + const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); + CHECK_OR_RETURN(IsIndexDataType(index->data_type())); + const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); + user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); + *out->mut_data_type() = in->data_type(); + return Maybe::Ok(); +} + } // namespace REGISTER_USER_OP("dim_gather") @@ -115,7 +114,7 @@ REGISTER_USER_OP("dim_gather") .Attr("dim") .SetTensorDescInferFn(InferTensorDesc) .SetInputArgModifyFn(GatherInputArgModifierFn) - .SetBatchAxisInferFn(InferBatchAxis) + .SetDataTypeInferFn(InferDtype) .SetGetSbpFn(BuildSbp); 
REGISTER_USER_OP_GRAD("dim_gather").SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 796683dc0e4..29d5409066c 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -32,7 +32,7 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { int32_t dim = ctx->Attr("dim"); - const SbpParallel& input_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); + const cfg::SbpParallel& input_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); int64_t split_axis = input_sbp.split_parallel().axis(); if (ctx->parallel_ctx().parallel_num() != 1 && input_sbp.has_split_parallel()) { CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; @@ -70,7 +70,7 @@ Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierF const user_op::UserOpConfWrapper&) { user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); CHECK(like_arg_modifier != nullptr); - like_arg_modifier->set_use_header_only(true); + // like_arg_modifier->set_use_header_only(true); like_arg_modifier->set_requires_grad(false); user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); @@ -91,13 +91,6 @@ Maybe InplaceInputArgModifierFn(user_op::GetInputArgModifier GetInputArgMo return Maybe::Ok(); } -Maybe InferBatchAxis(user_op::BatchAxisContext* ctx) { - CHECK_OR_RETURN(*ctx->BatchAxis4ArgNameAndIndex("index", 0) - == *ctx->BatchAxis4ArgNameAndIndex("input", 0)); - *ctx->BatchAxis4ArgNameAndIndex("output", 0) = *ctx->BatchAxis4ArgNameAndIndex("input", 0); - return Maybe::Ok(); -} - void _SetSbp(user_op::SbpContext* ctx, const char* like_or_src) { ctx->NewBuilder() .PartialSum(user_op::OpArg("input", 0)) @@ -116,6 +109,15 @@ Maybe SetSbpInplace(user_op::SbpContext* ctx) { _SetSbp(ctx, "src"); return Maybe::Ok(); } + +Maybe InferDtype(user_op::InferContext* ctx) { + const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); + CHECK_OR_RETURN(IsIndexDataType(index->data_type())); + const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); + user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); + *out->mut_data_type() = in->data_type(); + return Maybe::Ok(); +} } // namespace #define REGISTER_SCATTER_LIKE_OP(optypename) \ @@ -127,7 +129,7 @@ Maybe SetSbpInplace(user_op::SbpContext* ctx) { .Attr("dim") \ .SetTensorDescInferFn(InferTensorDesc) \ .SetInputArgModifyFn(InputArgModifierFn) \ - .SetBatchAxisInferFn(InferBatchAxis) \ + .SetDataTypeInferFn(InferDtype) \ .SetGetSbpFn(SetSbpLike) #define REGISTER_SCATTER_INPLACE_OP(optypename) \ @@ -139,7 +141,7 @@ Maybe SetSbpInplace(user_op::SbpContext* ctx) { .Attr("dim") \ .SetTensorDescInferFn(InferTensorDesc) \ .SetInputArgModifyFn(InplaceInputArgModifierFn) \ - .SetBatchAxisInferFn(InferBatchAxis) \ + .SetDataTypeInferFn(InferDtype) \ .SetGetSbpFn(SetSbpInplace) #define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ From e99396f59280392e261ae1fef12a1c6f297ce4a5 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Fri, 2 Jul 2021 20:49:46 +0800 Subject: [PATCH 30/82] annotate sbp infer --- oneflow/user/ops/dim_gather_op.cpp | 12 ++++++------ oneflow/user/ops/dim_scatter_ops.cpp | 17 +++++++++-------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index 12cce2e3664..af22a182841 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp 
@@ -36,12 +36,12 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); // split_axs should NOT equals dim when in consistent view - const cfg::SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); - auto is_split = in_sbp.has_split_parallel(); - if (ctx->parallel_ctx().parallel_num() != 1 && is_split) { - int64_t split_axis = in_sbp.split_parallel().axis(); - CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; - } + // const cfg::SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); + // auto is_split = in_sbp.has_split_parallel(); + // if (ctx->parallel_ctx().parallel_num() != 1 && is_split) { + // int64_t split_axis = in_sbp.split_parallel().axis(); + // CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; + // } CHECK_OR_RETURN(!in->is_dynamic()); CHECK_OR_RETURN(!index->is_dynamic()); diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 29d5409066c..873ca6ed8c9 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -32,11 +32,11 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { int32_t dim = ctx->Attr("dim"); - const cfg::SbpParallel& input_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); - int64_t split_axis = input_sbp.split_parallel().axis(); - if (ctx->parallel_ctx().parallel_num() != 1 && input_sbp.has_split_parallel()) { - CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; - } + // const cfg::SbpParallel& input_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); + // int64_t split_axis = input_sbp.split_parallel().axis(); + // if (ctx->parallel_ctx().parallel_num() != 1 && input_sbp.has_split_parallel()) { + // CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; + // } int64_t input_num_axes = input->shape().NumAxes(); CHECK_GT_OR_RETURN(input_num_axes, 0); @@ -55,9 +55,10 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { } CHECK_EQ_OR_RETURN(input_num_axes, output_num_axes); - FOR_RANGE(int64_t, i, 0, input_num_axes) { - CHECK_EQ_OR_RETURN(index->shape().At(i), input->shape().At(i)); - } + // todo(zzk): it is not align with torch + // FOR_RANGE(int64_t, i, 0, input_num_axes) { + // CHECK_EQ_OR_RETURN(index->shape().At(i), input->shape().At(i)); + // } user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = src ? 
src->shape() : like->shape(); From 332501edf7a54091f359b97eb8e0e94bbe75d5d8 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 5 Jul 2021 18:01:42 +0800 Subject: [PATCH 31/82] rewrite scatter kernel logic --- .../user/kernels/dim_gather_scatter_util.h | 33 +++++++------ .../user/kernels/dim_scatter_kernel_util.h | 49 ++++++++++++++++--- oneflow/user/kernels/dim_scatter_kernels.cpp | 45 ++++++++++++----- oneflow/user/ops/dim_scatter_ops.cpp | 22 ++++++--- 4 files changed, 106 insertions(+), 43 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index c868acfc5b5..e0185da3cf1 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -56,7 +56,7 @@ struct DeviceBinOp { #define DECLARE_DIMSCATTER_FUNCTOR(binop) \ template \ struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output); \ } @@ -64,41 +64,42 @@ struct DeviceBinOp { #define IMPLEMENT_DIMSCATTER_CPUFUNCTOR(binop) \ template \ struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { \ - DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, \ - index, input, output, DeviceBinOp::binop); \ + int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) { \ + DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, \ + index, src, output, DeviceBinOp::binop); \ } \ } #define IMPLEMENT_DIMSCATTER_GPUFUNCTOR(binop) \ template \ - __global__ void DoCUDADimScatter##binop(const DimOpIndexNdHelper input_nd_helper, \ + __global__ void DoCUDADimScatter##binop(const DimOpIndexNdHelper src_nd_helper, \ + const DimOpIndexNdHelper idx_nd_helper, \ const DimOpIndexNdHelper output_nd_helper, \ int ndim, int64_t elem_cnt, int32_t dim, \ - const IDX_T* index, const IN_T* input, IN_T* output) { \ - DoDimScatterBinOp(input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, \ - input, output, DeviceBinOp::binop); \ + const IDX_T* index, const IN_T* src, IN_T* output) { \ + DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, \ + src, output, DeviceBinOp::binop); \ } \ template \ struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { \ + int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) { \ RUN_CUDA_KERNEL((DoCUDADimScatter##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ - input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, \ + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, src, \ output); \ } \ }; 
\ template \ struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const float16* input, float16* output) { \ + int32_t dim, const IDX_T* index, const float16* src, float16* output) { \ RUN_CUDA_KERNEL((DoCUDADimScatter##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ - input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, \ - reinterpret_cast(input), reinterpret_cast(output)); \ + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, \ + reinterpret_cast(src), reinterpret_cast(output)); \ } \ } diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index b502edf08c8..1c2e31344a3 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -16,7 +16,6 @@ limitations under the License. #ifndef ONEFLOW_USER_KERNELS_DIM_SCATTER_KERNEL_UTIL_H_ #define ONEFLOW_USER_KERNELS_DIM_SCATTER_KERNEL_UTIL_H_ #include "oneflow/user/kernels/dim_gather_scatter_util.h" - // Steps for adding a binary operation on scatter are as follows: // 1. implment binop in DeviceBinOp, for example "Mul": // OF_DEVICE_FUNC static void Mul(const T* x, T* y) { *y *= *x; } @@ -46,18 +45,54 @@ namespace user_op { DECLARE_DIMSCATTER_FUNCTOR(Add); DECLARE_DIMSCATTER_FUNCTOR(Update); +// template +// OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_helper, +// const DimOpIndexNdHelper& output_nd_helper, int ndim, +// int64_t elem_cnt, int32_t dim, const IDX_T* index, +// const IN_T* src, IN_T* output, BinaryOpFn bin_op) { +// XPU_1D_KERNEL_LOOP(src_offset, elem_cnt) { +// IDX_T coordinate[kDimGatherMaxDimCount] = {0}; +// src_nd_helper.OffsetToNdIndex(src_offset, coordinate, ndim); +// coordinate[dim] = index[src_offset]; + +// IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); +// bin_op(src + src_offset, output + output_offset); +// } +// } + template -OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& input_nd_helper, +OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, int32_t dim, const IDX_T* index, - const IN_T* input, IN_T* output, BinaryOpFn bin_op) { - XPU_1D_KERNEL_LOOP(input_offset, elem_cnt) { + const IN_T* src, IN_T* output, BinaryOpFn bin_op) { + XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { + // 感觉需要从index_offset算src_offset + // 是不是还需要一个idx_nd_helper? 
+ + // an example IDX_T coordinate[kDimGatherMaxDimCount] = {0}; - input_nd_helper.OffsetToNdIndex(input_offset, coordinate, ndim); - coordinate[dim] = index[input_offset]; + idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); + printf("idx offset is: %d \n", idx_offset); + IDX_T src_offset = src_nd_helper.NdIndexToOffset(coordinate, ndim); + printf("src offset is: %d \n", src_offset); + coordinate[dim] = index[idx_offset]; IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); - bin_op(input + input_offset, output + output_offset); + printf("output offset is: %d \n", output_offset); + + printf("src is: %f \n", *(src+src_offset)); + printf("output is: %f \n", *(output+output_offset)); + + bin_op(src + src_offset, output + output_offset); + // ======= finish ======== + + // IDX_T coordinate[kDimGatherMaxDimCount] = {0}; + // src_nd_helper.OffsetToNdIndex(src_offset, coordinate, ndim); + // coordinate[dim] = index[idx_offset]; + + // IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); + // bin_op(src + src_offset, output + output_offset); } } diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 42613ea083e..c2e5b8787d7 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -28,12 +28,12 @@ namespace user_op { ~DimScatter##binop##Kernel() override = default; \ \ private: \ - void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ + void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* input, \ + int32_t dim, const IDX_T* index, const IN_T* src, \ IN_T* output) const override { \ DimScatter##binop##Functor()( \ - ctx, input_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, input, output); \ + ctx, src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, src, output); \ } \ bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ } @@ -70,9 +70,10 @@ namespace user_op { REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop); // ---- REGISTER INPLACE OPS ---- +// todo(zzk): maybe error here Maybe SetInplace(const user_op::InferContext&, user_op::AddInplaceArgPair AddInplaceArgPairFn) { - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("output", 0, "src", 0, true)); + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("output", 0, "input", 0, true)); return Maybe::Ok(); } @@ -113,9 +114,9 @@ class DimScatterBaseKernel : public user_op::OpKernel { public: DimScatterBaseKernel() = default; ~DimScatterBaseKernel() override = default; - virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, const DimOpIndexNdHelper& output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* input, + int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) const = 0; private: @@ -123,23 +124,32 @@ class DimScatterBaseKernel : public user_op::OpKernel { const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); + const Tensor* src_tensor = ctx->Tensor4ArgNameAndIndex("src", 0); + printf("1 
\n"); const int32_t dim = ctx->Attr("dim"); + printf("2 \n"); const IN_T* input = input_tensor->dptr(); const IDX_T* index = index_tensor->dptr(); IN_T* output = out_tensor->mut_dptr(); size_t out_bytes_size = out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); + printf("3 \n"); Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); - Tensor* src_tensor = ctx->Tensor4ArgNameAndIndex("src", 0); - if (src_tensor) { - Memcpy(ctx->device_ctx(), output, src_tensor->dptr(), out_bytes_size); + const IN_T* src = src_tensor->dptr(); + + printf("4 \n"); + + // fix bug here! + if (input_tensor) { + Memcpy(ctx->device_ctx(), output, input, out_bytes_size); } else if (like_tensor) { Memset(ctx->device_ctx(), output, 0, out_bytes_size); } else { Error::Unimplemented(); } + printf("5 \n"); int ndim = input_tensor->shape().NumAxes(); fixed_vector shape_vec(ndim); @@ -147,13 +157,22 @@ class DimScatterBaseKernel : public user_op::OpKernel { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int64_t dim) -> IDX_T { return static_cast(dim); }); }; - shape2dims(input_tensor->shape()); - DimOpIndexNdHelper input_nd_helper(shape_vec.data(), ndim); + printf("6 \n"); + + shape2dims(src_tensor->shape()); + DimOpIndexNdHelper src_nd_helper(shape_vec.data(), ndim); + printf("7 \n"); + + shape2dims(index_tensor->shape()); + DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); + printf("8 \n"); + shape2dims(out_tensor->shape()); DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); + printf("9 \n"); - BinaryOp(ctx->device_ctx(), input_nd_helper, output_nd_helper, ndim, - input_tensor->shape().elem_cnt(), dim, index, input, output); + BinaryOp(ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, + index_tensor->shape().elem_cnt(), dim, index, src, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 873ca6ed8c9..86b7620f336 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -61,9 +61,8 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { // } user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); - *out->mut_shape() = src ? src->shape() : like->shape(); - *out->mut_data_type() = input->data_type(); - + // *out->mut_shape() = src ? src->shape() : like->shape(); + *out->mut_shape() = input ? 
input->shape() : like->shape(); return Maybe::Ok(); } @@ -114,9 +113,7 @@ Maybe SetSbpInplace(user_op::SbpContext* ctx) { Maybe InferDtype(user_op::InferContext* ctx) { const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); CHECK_OR_RETURN(IsIndexDataType(index->data_type())); - const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); - user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); - *out->mut_data_type() = in->data_type(); + *ctx->OutputDType("output", 0) = ctx->InputDType("input", 0); return Maybe::Ok(); } } // namespace @@ -133,6 +130,18 @@ Maybe InferDtype(user_op::InferContext* ctx) { .SetDataTypeInferFn(InferDtype) \ .SetGetSbpFn(SetSbpLike) +// #define REGISTER_SCATTER_INPLACE_OP(optypename) \ +// REGISTER_USER_OP(optypename) \ +// .OptionalInput("src") \ +// .Input("input") \ +// .Input("index") \ +// .Output("output") \ +// .Attr("dim") \ +// .SetTensorDescInferFn(InferTensorDesc) \ +// .SetInputArgModifyFn(InplaceInputArgModifierFn) \ +// .SetDataTypeInferFn(InferDtype) \ +// .SetGetSbpFn(SetSbpInplace) + #define REGISTER_SCATTER_INPLACE_OP(optypename) \ REGISTER_USER_OP(optypename) \ .OptionalInput("src") \ @@ -141,7 +150,6 @@ Maybe InferDtype(user_op::InferContext* ctx) { .Output("output") \ .Attr("dim") \ .SetTensorDescInferFn(InferTensorDesc) \ - .SetInputArgModifyFn(InplaceInputArgModifierFn) \ .SetDataTypeInferFn(InferDtype) \ .SetGetSbpFn(SetSbpInplace) From 5915d128ec57e955dbf9ed8a69fda5a3dca4988a Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 5 Jul 2021 20:50:39 +0800 Subject: [PATCH 32/82] remove inplace proposal and fix macro name --- .../user/kernels/dim_scatter_kernel_util.h | 34 ------ oneflow/user/kernels/dim_scatter_kernels.cpp | 103 +++++++----------- oneflow/user/ops/dim_scatter_ops.cpp | 24 +--- 3 files changed, 46 insertions(+), 115 deletions(-) diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index 1c2e31344a3..ff8bbf37d6d 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -45,20 +45,6 @@ namespace user_op { DECLARE_DIMSCATTER_FUNCTOR(Add); DECLARE_DIMSCATTER_FUNCTOR(Update); -// template -// OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_helper, -// const DimOpIndexNdHelper& output_nd_helper, int ndim, -// int64_t elem_cnt, int32_t dim, const IDX_T* index, -// const IN_T* src, IN_T* output, BinaryOpFn bin_op) { -// XPU_1D_KERNEL_LOOP(src_offset, elem_cnt) { -// IDX_T coordinate[kDimGatherMaxDimCount] = {0}; -// src_nd_helper.OffsetToNdIndex(src_offset, coordinate, ndim); -// coordinate[dim] = index[src_offset]; - -// IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); -// bin_op(src + src_offset, output + output_offset); -// } -// } template OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_helper, @@ -67,32 +53,12 @@ OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_he int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output, BinaryOpFn bin_op) { XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { - // 感觉需要从index_offset算src_offset - // 是不是还需要一个idx_nd_helper? 
- - // an example IDX_T coordinate[kDimGatherMaxDimCount] = {0}; idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); - printf("idx offset is: %d \n", idx_offset); IDX_T src_offset = src_nd_helper.NdIndexToOffset(coordinate, ndim); - printf("src offset is: %d \n", src_offset); - coordinate[dim] = index[idx_offset]; IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); - printf("output offset is: %d \n", output_offset); - - printf("src is: %f \n", *(src+src_offset)); - printf("output is: %f \n", *(output+output_offset)); - bin_op(src + src_offset, output + output_offset); - // ======= finish ======== - - // IDX_T coordinate[kDimGatherMaxDimCount] = {0}; - // src_nd_helper.OffsetToNdIndex(src_offset, coordinate, ndim); - // coordinate[dim] = index[idx_offset]; - - // IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); - // bin_op(src + src_offset, output + output_offset); } } diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index c2e5b8787d7..9f3949d9b3d 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -38,76 +38,68 @@ namespace user_op { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ } -#define REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, dtype, itype, optypename, binop) \ +#define REGISTER_DIM_SCATTER_LIKE_KERNEL(device, dtype, itype, optypename, binop) \ REGISTER_USER_KERNEL(optypename) \ .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -#define REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(device, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, float, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, double, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, int32_t, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, float, int64_t, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, double, int64_t, optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(device, int32_t, int64_t, optypename, binop) +#define REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, float, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, double, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, int32_t, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, float, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, double, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, int32_t, int64_t, optypename, binop) -#define REGISTER_DIM_SCATTER_OUTPLACE_CPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); +#define REGISTER_DIM_SCATTER_LIKE_CPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); #ifdef WITH_CUDA -#define REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_OUT_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_OUTPLACE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); +#define 
REGISTER_DIM_SCATTER_LIKE_GPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); #else -#define REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop) +#define REGISTER_DIM_SCATTER_LIKE_GPUKERNELS(optypename, binop) #endif // WITH_CUDA -#define REGISTER_SCATTER_OUTPLACE_KERNEL(optypename, binop) \ - REGISTER_DIM_SCATTER_OUTPLACE_CPUKERNELS(optypename, binop); \ - REGISTER_DIM_SCATTER_OUTPLACE_GPUKERNELS(optypename, binop); +#define REGISTER_SCATTER_LIKE_KERNEL(optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_CPUKERNELS(optypename, binop); \ + REGISTER_DIM_SCATTER_LIKE_GPUKERNELS(optypename, binop); -// ---- REGISTER INPLACE OPS ---- -// todo(zzk): maybe error here -Maybe SetInplace(const user_op::InferContext&, - user_op::AddInplaceArgPair AddInplaceArgPairFn) { - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("output", 0, "input", 0, true)); - return Maybe::Ok(); -} -#define REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, dtype, itype, optypename, binop) \ +#define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype, optypename, binop) \ REGISTER_USER_KERNEL(optypename) \ .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)) \ - .SetInplaceProposalFn(SetInplace); -#define REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(device, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, float, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, double, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, int32_t, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, float, int64_t, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, double, int64_t, optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(device, int32_t, int64_t, optypename, binop) +#define REGISTER_DIM_SCATTER_BINOP_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_SCATTER_KERNEL(device, float, int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(device, double, int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(device, float, int64_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(device, double, int64_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int64_t, optypename, binop); -#define REGISTER_DIM_SCATTER_INPLACE_CPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); +#define REGISTER_DIM_SCATTER_CPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); #ifdef WITH_CUDA -#define REGISTER_DIM_SCATTER_INPLACE_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_IN_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_INPLACE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); +#define REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop) \ + REGISTER_DIM_SCATTER_BINOP_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, 
int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); #else -#define REGISTER_DIM_SCATTER_INPLACE_GPUKERNELS(optypename, binop) +#define REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop) #endif // WITH_CUDA -#define REGISTER_SCATTER_INTPLACE_KERNEL(optypename, binop) \ - REGISTER_DIM_SCATTER_INPLACE_CPUKERNELS(optypename, binop); \ - REGISTER_DIM_SCATTER_INPLACE_GPUKERNELS(optypename, binop); +#define REGISTER_SCATTER_KERNEL(optypename, binop) \ + REGISTER_DIM_SCATTER_CPUKERNELS(optypename, binop); \ + REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop); template class DimScatterBaseKernel : public user_op::OpKernel { @@ -125,23 +117,17 @@ class DimScatterBaseKernel : public user_op::OpKernel { const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); const Tensor* src_tensor = ctx->Tensor4ArgNameAndIndex("src", 0); - printf("1 \n"); const int32_t dim = ctx->Attr("dim"); - printf("2 \n"); const IN_T* input = input_tensor->dptr(); const IDX_T* index = index_tensor->dptr(); IN_T* output = out_tensor->mut_dptr(); size_t out_bytes_size = out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); - printf("3 \n"); Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); const IN_T* src = src_tensor->dptr(); - printf("4 \n"); - - // fix bug here! if (input_tensor) { Memcpy(ctx->device_ctx(), output, input, out_bytes_size); } else if (like_tensor) { @@ -149,7 +135,6 @@ class DimScatterBaseKernel : public user_op::OpKernel { } else { Error::Unimplemented(); } - printf("5 \n"); int ndim = input_tensor->shape().NumAxes(); fixed_vector shape_vec(ndim); @@ -157,20 +142,12 @@ class DimScatterBaseKernel : public user_op::OpKernel { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int64_t dim) -> IDX_T { return static_cast(dim); }); }; - printf("6 \n"); - shape2dims(src_tensor->shape()); DimOpIndexNdHelper src_nd_helper(shape_vec.data(), ndim); - printf("7 \n"); - shape2dims(index_tensor->shape()); DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); - printf("8 \n"); - shape2dims(out_tensor->shape()); DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); - printf("9 \n"); - BinaryOp(ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, index_tensor->shape().elem_cnt(), dim, index, src, output); } @@ -180,10 +157,10 @@ class DimScatterBaseKernel : public user_op::OpKernel { IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Add); IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Update); -REGISTER_SCATTER_OUTPLACE_KERNEL("dim_scatter_add_like", Add); -REGISTER_SCATTER_OUTPLACE_KERNEL("dim_scatter_update_like", Update); -REGISTER_SCATTER_INTPLACE_KERNEL("dim_scatter_add", Add); -REGISTER_SCATTER_INTPLACE_KERNEL("dim_scatter_update", Update); +REGISTER_SCATTER_LIKE_KERNEL("dim_scatter_add_like", Add); +REGISTER_SCATTER_LIKE_KERNEL("dim_scatter_update_like", Update); +REGISTER_SCATTER_KERNEL("dim_scatter_add", Add); +REGISTER_SCATTER_KERNEL("dim_scatter_update", Update); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 86b7620f336..21b646426af 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -61,7 +61,6 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { // } user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); - // *out->mut_shape() = src ? 
src->shape() : like->shape(); *out->mut_shape() = input ? input->shape() : like->shape(); return Maybe::Ok(); } @@ -105,7 +104,7 @@ Maybe SetSbpLike(user_op::SbpContext* ctx) { return Maybe::Ok(); } -Maybe SetSbpInplace(user_op::SbpContext* ctx) { +Maybe SetSbpScatter(user_op::SbpContext* ctx) { _SetSbp(ctx, "src"); return Maybe::Ok(); } @@ -130,19 +129,8 @@ Maybe InferDtype(user_op::InferContext* ctx) { .SetDataTypeInferFn(InferDtype) \ .SetGetSbpFn(SetSbpLike) -// #define REGISTER_SCATTER_INPLACE_OP(optypename) \ -// REGISTER_USER_OP(optypename) \ -// .OptionalInput("src") \ -// .Input("input") \ -// .Input("index") \ -// .Output("output") \ -// .Attr("dim") \ -// .SetTensorDescInferFn(InferTensorDesc) \ -// .SetInputArgModifyFn(InplaceInputArgModifierFn) \ -// .SetDataTypeInferFn(InferDtype) \ -// .SetGetSbpFn(SetSbpInplace) - -#define REGISTER_SCATTER_INPLACE_OP(optypename) \ + +#define REGISTER_SCATTER_OP(optypename) \ REGISTER_USER_OP(optypename) \ .OptionalInput("src") \ .Input("input") \ @@ -151,7 +139,7 @@ Maybe InferDtype(user_op::InferContext* ctx) { .Attr("dim") \ .SetTensorDescInferFn(InferTensorDesc) \ .SetDataTypeInferFn(InferDtype) \ - .SetGetSbpFn(SetSbpInplace) + .SetGetSbpFn(SetSbpScatter) #define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ REGISTER_USER_OP_GRAD(optypename) \ @@ -173,8 +161,8 @@ Maybe InferDtype(user_op::InferContext* ctx) { REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); REGISTER_SCATTER_LIKE_OP("dim_scatter_update_like"); -REGISTER_SCATTER_INPLACE_OP("dim_scatter_add"); -REGISTER_SCATTER_INPLACE_OP("dim_scatter_update"); +REGISTER_SCATTER_OP("dim_scatter_add"); +REGISTER_SCATTER_OP("dim_scatter_update"); REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_add_like"); REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update_like"); From 327944fa6bc05af2342ce444c6d20f2e96fffc5e Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Tue, 6 Jul 2021 11:05:35 +0800 Subject: [PATCH 33/82] remove outdated atomic add --- oneflow/user/kernels/dim_gather_scatter_util.h | 1 - 1 file changed, 1 deletion(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index e0185da3cf1..254edaecbc7 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -42,7 +42,6 @@ template struct DeviceBinOp { OF_DEVICE_FUNC static void Add(const T* x, T* y) { #ifdef __CUDA_ARCH__ - // gpu_atomic_add(y, *x); // TODO:(YaoChi), refine add using float16 -> half -> float -> half cuda::atomic::Add(y, *x); #else *y += *x; From ae614629325e55973c6d7c2755dfaea45fc45463 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Tue, 6 Jul 2021 11:05:52 +0800 Subject: [PATCH 34/82] move sbp infer --- oneflow/user/ops/dim_scatter_ops.cpp | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 21b646426af..ae71f96fed2 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -32,12 +32,6 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { int32_t dim = ctx->Attr("dim"); - // const cfg::SbpParallel& input_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); - // int64_t split_axis = input_sbp.split_parallel().axis(); - // if (ctx->parallel_ctx().parallel_num() != 1 && input_sbp.has_split_parallel()) { - // CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; - // } - int64_t input_num_axes = 
input->shape().NumAxes(); CHECK_GT_OR_RETURN(input_num_axes, 0); CHECK_LE_OR_RETURN(input_num_axes, kDimGatherMaxDimCount); @@ -109,6 +103,16 @@ Maybe SetSbpScatter(user_op::SbpContext* ctx) { return Maybe::Ok(); } +Maybe SetSbpInfer(user_op::InferSbpSignatureFnContext* ctx) { + int32_t dim = ctx->Attr("dim"); + const cfg::SbpParallel input_sbp = ctx->SbpParallelHint4InputArgNameAndIndex("input", 0); + int64_t split_axis = input_sbp.split_parallel().axis(); + if (ctx->parallel_num() != 1 && input_sbp.has_split_parallel()) { + CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; + } + return Maybe::Ok(); +} + Maybe InferDtype(user_op::InferContext* ctx) { const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); CHECK_OR_RETURN(IsIndexDataType(index->data_type())); @@ -127,7 +131,8 @@ Maybe InferDtype(user_op::InferContext* ctx) { .SetTensorDescInferFn(InferTensorDesc) \ .SetInputArgModifyFn(InputArgModifierFn) \ .SetDataTypeInferFn(InferDtype) \ - .SetGetSbpFn(SetSbpLike) + .SetGetSbpFn(SetSbpLike) \ + .SetSbpSignatureInferFn(SetSbpInfer) #define REGISTER_SCATTER_OP(optypename) \ @@ -139,7 +144,9 @@ Maybe InferDtype(user_op::InferContext* ctx) { .Attr("dim") \ .SetTensorDescInferFn(InferTensorDesc) \ .SetDataTypeInferFn(InferDtype) \ - .SetGetSbpFn(SetSbpScatter) + .SetGetSbpFn(SetSbpScatter) \ + .SetSbpSignatureInferFn(SetSbpInfer) + #define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ REGISTER_USER_OP_GRAD(optypename) \ From cde37c008d92f1575a545f37a7b05ad2a3905b43 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Tue, 6 Jul 2021 18:02:27 +0800 Subject: [PATCH 35/82] add const and throw error --- .../user/kernels/dim_gather_scatter_util.h | 26 +++++++++---------- .../user/kernels/dim_scatter_kernel_util.h | 19 ++++++++++---- oneflow/user/kernels/dim_scatter_kernels.cpp | 20 ++++++++------ oneflow/user/ops/dim_scatter_ops.cpp | 24 ++++++++++++----- 4 files changed, 56 insertions(+), 33 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index 254edaecbc7..a4fe96b17ba 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -56,17 +56,17 @@ struct DeviceBinOp { template \ struct DimScatter##binop##Functor final { \ void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output); \ + const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, \ + const int32_t dim, const int64_t upper_bound, const IDX_T* index, const IN_T* src, IN_T* output); \ } #define IMPLEMENT_DIMSCATTER_CPUFUNCTOR(binop) \ template \ struct DimScatter##binop##Functor final { \ void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) { \ - DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, \ + const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, \ + const int32_t dim, const int64_t upper_bound, const IDX_T* index, const IN_T* src, IN_T* output) { \ + DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, \ index, src, output, 
DeviceBinOp::binop); \ } \ } @@ -76,28 +76,28 @@ struct DeviceBinOp { __global__ void DoCUDADimScatter##binop(const DimOpIndexNdHelper src_nd_helper, \ const DimOpIndexNdHelper idx_nd_helper, \ const DimOpIndexNdHelper output_nd_helper, \ - int ndim, int64_t elem_cnt, int32_t dim, \ + const int ndim, const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ const IDX_T* index, const IN_T* src, IN_T* output) { \ - DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, \ + DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, \ src, output, DeviceBinOp::binop); \ } \ template \ struct DimScatter##binop##Functor final { \ void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output) { \ + const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, \ + const int32_t dim, const int64_t upper_bound, const IDX_T* index, const IN_T* src, IN_T* output) { \ RUN_CUDA_KERNEL((DoCUDADimScatter##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ - src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, src, \ + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, src, \ output); \ } \ }; \ template \ struct DimScatter##binop##Functor final { \ void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const float16* src, float16* output) { \ + const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, \ + const int32_t dim, const int64_t upper_bound, const IDX_T* index, const float16* src, float16* output) { \ RUN_CUDA_KERNEL((DoCUDADimScatter##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ - src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, \ + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, \ reinterpret_cast(src), reinterpret_cast(output)); \ } \ } diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index ff8bbf37d6d..0ce3bf0e64e 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef ONEFLOW_USER_KERNELS_DIM_SCATTER_KERNEL_UTIL_H_ #define ONEFLOW_USER_KERNELS_DIM_SCATTER_KERNEL_UTIL_H_ #include "oneflow/user/kernels/dim_gather_scatter_util.h" +#include "oneflow/core/common/error.pb.h" + // Steps for adding a binary operation on scatter are as follows: // 1. 
implment binop in DeviceBinOp, for example "Mul": // OF_DEVICE_FUNC static void Mul(const T* x, T* y) { *y *= *x; } @@ -45,17 +47,24 @@ namespace user_op { DECLARE_DIMSCATTER_FUNCTOR(Add); DECLARE_DIMSCATTER_FUNCTOR(Update); - template OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, const IDX_T* index, const IN_T* src, IN_T* output, BinaryOpFn bin_op) { XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; - idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); - IDX_T src_offset = src_nd_helper.NdIndexToOffset(coordinate, ndim); + idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk + if(idx_offset>upper_bound){ + #if __CUDA_ARCH__ + __trap(); + #else + std::cout<<"The index element "<& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* src, \ + int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T* src, \ IN_T* output) const override { \ DimScatter##binop##Functor()( \ - ctx, src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, index, src, output); \ + ctx, src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, src, output); \ } \ bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ } @@ -107,8 +107,8 @@ class DimScatterBaseKernel : public user_op::OpKernel { DimScatterBaseKernel() = default; ~DimScatterBaseKernel() override = default; virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* src, + const DimOpIndexNdHelper& output_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T* src, IN_T* output) const = 0; private: @@ -133,14 +133,15 @@ class DimScatterBaseKernel : public user_op::OpKernel { } else if (like_tensor) { Memset(ctx->device_ctx(), output, 0, out_bytes_size); } else { - Error::Unimplemented(); + std::cout<<"Unimplemented Error"<shape().NumAxes(); + const int ndim = input_tensor->shape().NumAxes(); fixed_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), - [](int64_t dim) -> IDX_T { return static_cast(dim); }); + [](int32_t dim) -> IDX_T { return static_cast(dim); }); }; shape2dims(src_tensor->shape()); DimOpIndexNdHelper src_nd_helper(shape_vec.data(), ndim); @@ -148,8 +149,11 @@ class DimScatterBaseKernel : public user_op::OpKernel { DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); shape2dims(out_tensor->shape()); DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); + + const int64_t upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound + BinaryOp(ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, - index_tensor->shape().elem_cnt(), dim, index, src, output); + index_tensor->shape().elem_cnt(), dim, upper_bound, index, src, output); } bool 
AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index ae71f96fed2..22809d26309 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -32,13 +32,12 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { int32_t dim = ctx->Attr("dim"); + // check input.numaxes == index.numaxes == src/like.num_axes int64_t input_num_axes = input->shape().NumAxes(); CHECK_GT_OR_RETURN(input_num_axes, 0); CHECK_LE_OR_RETURN(input_num_axes, kDimGatherMaxDimCount); - int64_t index_num_axes = index->shape().NumAxes(); CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); - int64_t output_num_axes = 0; if (src) { output_num_axes = src->shape().NumAxes(); @@ -49,10 +48,22 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { } CHECK_EQ_OR_RETURN(input_num_axes, output_num_axes); - // todo(zzk): it is not align with torch - // FOR_RANGE(int64_t, i, 0, input_num_axes) { - // CHECK_EQ_OR_RETURN(index->shape().At(i), input->shape().At(i)); - // } + // check index.shape(i) <= input.shape(i) + FOR_RANGE(int64_t, i, 0, input_num_axes) { + if(i==dim) continue; + CHECK_LE_OR_RETURN(index->shape().At(i), input->shape().At(i)); + } + + // check index.shape(i) <= src/like.shape(i) + FOR_RANGE(int64_t, i, 0, input_num_axes) { + if(i==dim) continue; + if(src){ + CHECK_LE_OR_RETURN(index->shape().At(i), src->shape().At(i)); + } + else{ + CHECK_LE_OR_RETURN(index->shape().At(i), like->shape().At(i)); + } + } user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = input ? input->shape() : like->shape(); @@ -63,7 +74,6 @@ Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierF const user_op::UserOpConfWrapper&) { user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); CHECK(like_arg_modifier != nullptr); - // like_arg_modifier->set_use_header_only(true); like_arg_modifier->set_requires_grad(false); user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); From 98e0f704ae53df0d00e056685bed715a54a2669d Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Wed, 7 Jul 2021 16:49:12 +0800 Subject: [PATCH 36/82] add check --- .../user/kernels/dim_gather_scatter_util.h | 2 +- .../user/kernels/dim_scatter_kernel_util.h | 9 +- oneflow/user/kernels/dim_scatter_kernels.cpp | 15 ++- oneflow/user/ops/dim_gather_op.cpp | 14 --- oneflow/user/ops/dim_scatter_ops.cpp | 113 ++++++++++++------ 5 files changed, 91 insertions(+), 62 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index a4fe96b17ba..ddb3fd27f7d 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -76,7 +76,7 @@ struct DeviceBinOp { __global__ void DoCUDADimScatter##binop(const DimOpIndexNdHelper src_nd_helper, \ const DimOpIndexNdHelper idx_nd_helper, \ const DimOpIndexNdHelper output_nd_helper, \ - const int ndim, const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ + const int ndim, const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ const IDX_T* index, const IN_T* src, IN_T* output) { \ DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, \ src, output, DeviceBinOp::binop); \ diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index 
0ce3bf0e64e..90510e636cf 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -51,21 +51,22 @@ template OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, const IDX_T* index, + const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T* src, IN_T* output, BinaryOpFn bin_op) { XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk - if(idx_offset>upper_bound){ + IDX_T idx_elem = index[idx_offset]; + if(idx_elem>=upper_bound){ #if __CUDA_ARCH__ __trap(); #else - std::cout<<"The index element "<>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("like", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); #define REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(device, optypename, binop) \ @@ -119,7 +119,6 @@ class DimScatterBaseKernel : public user_op::OpKernel { const Tensor* src_tensor = ctx->Tensor4ArgNameAndIndex("src", 0); const int32_t dim = ctx->Attr("dim"); - const IN_T* input = input_tensor->dptr(); const IDX_T* index = index_tensor->dptr(); IN_T* output = out_tensor->mut_dptr(); size_t out_bytes_size = @@ -129,7 +128,7 @@ class DimScatterBaseKernel : public user_op::OpKernel { const IN_T* src = src_tensor->dptr(); if (input_tensor) { - Memcpy(ctx->device_ctx(), output, input, out_bytes_size); + Memcpy(ctx->device_ctx(), output, input_tensor->dptr(), out_bytes_size); } else if (like_tensor) { Memset(ctx->device_ctx(), output, 0, out_bytes_size); } else { @@ -137,7 +136,7 @@ class DimScatterBaseKernel : public user_op::OpKernel { throw Error::Unimplemented(); } - const int ndim = input_tensor->shape().NumAxes(); + const int ndim = src_tensor->shape().NumAxes(); fixed_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), @@ -150,7 +149,13 @@ class DimScatterBaseKernel : public user_op::OpKernel { shape2dims(out_tensor->shape()); DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); - const int64_t upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound + int64_t upper_bound = 0; + if(input_tensor){ + upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound + } + else{ + upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound + } BinaryOp(ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, index_tensor->shape().elem_cnt(), dim, upper_bound, index, src, output); diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index af22a182841..a10f38dfdc3 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp @@ -35,23 +35,9 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { CHECK_LT_OR_RETURN(dim, input_num_axes); CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); - // split_axs should NOT equals dim when in consistent view - // const cfg::SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("input", 0); - // auto is_split = 
in_sbp.has_split_parallel(); - // if (ctx->parallel_ctx().parallel_num() != 1 && is_split) { - // int64_t split_axis = in_sbp.split_parallel().axis(); - // CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; - // } - CHECK_OR_RETURN(!in->is_dynamic()); CHECK_OR_RETURN(!index->is_dynamic()); - // for scatter backword, this check moved to python - // FOR_RANGE(int64_t, i, 0, input_num_axes) { - // if (i == dim) { continue; } - // CHECK_EQ_OR_RETURN(in->shape().At(i), index->shape().At(i)); - // } - user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = index->shape(); *out->mut_data_type() = in->data_type(); diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 22809d26309..cd731f9b7e1 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -32,46 +32,49 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { int32_t dim = ctx->Attr("dim"); - // check input.numaxes == index.numaxes == src/like.num_axes - int64_t input_num_axes = input->shape().NumAxes(); - CHECK_GT_OR_RETURN(input_num_axes, 0); - CHECK_LE_OR_RETURN(input_num_axes, kDimGatherMaxDimCount); + // check index.numaxes == src.num_axes == input/like.numaxes + int64_t src_num_axes = src->shape().NumAxes(); + CHECK_GT_OR_RETURN(src_num_axes, 0); + CHECK_LE_OR_RETURN(src_num_axes, kDimGatherMaxDimCount); int64_t index_num_axes = index->shape().NumAxes(); - CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); - int64_t output_num_axes = 0; - if (src) { - output_num_axes = src->shape().NumAxes(); + CHECK_EQ_OR_RETURN(src_num_axes, index_num_axes); + + int64_t output_num_axes = 0; + if (input) { + output_num_axes = input->shape().NumAxes(); } else if (like) { output_num_axes = like->shape().NumAxes(); } else { Error::Unimplemented(); } - CHECK_EQ_OR_RETURN(input_num_axes, output_num_axes); - - // check index.shape(i) <= input.shape(i) - FOR_RANGE(int64_t, i, 0, input_num_axes) { - if(i==dim) continue; - CHECK_LE_OR_RETURN(index->shape().At(i), input->shape().At(i)); - } + CHECK_EQ_OR_RETURN(output_num_axes, index_num_axes); - // check index.shape(i) <= src/like.shape(i) - FOR_RANGE(int64_t, i, 0, input_num_axes) { + // check index.shape(i) <= input/like.shape(i) + FOR_RANGE(int64_t, i, 0, index_num_axes) { if(i==dim) continue; - if(src){ - CHECK_LE_OR_RETURN(index->shape().At(i), src->shape().At(i)); + if(input){ + CHECK_LE_OR_RETURN(index->shape().At(i), input->shape().At(i)); } else{ CHECK_LE_OR_RETURN(index->shape().At(i), like->shape().At(i)); } } + + // check index.shape(i) <= src.shape(i) + FOR_RANGE(int64_t, i, 0, index_num_axes) { + if(i==dim) continue; + CHECK_LE_OR_RETURN(index->shape().At(i), src->shape().At(i)); + } user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = input ? input->shape() : like->shape(); + printf("infertensor ok"); return Maybe::Ok(); } Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, const user_op::UserOpConfWrapper&) { + // is there a problem? 
user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); CHECK(like_arg_modifier != nullptr); like_arg_modifier->set_requires_grad(false); @@ -95,6 +98,36 @@ Maybe InplaceInputArgModifierFn(user_op::GetInputArgModifier GetInputArgMo } void _SetSbp(user_op::SbpContext* ctx, const char* like_or_src) { + const user_op::TensorDesc& index_tensor = + ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); + int64_t index_num_axes = index_tensor.shape().NumAxes(); + const int32_t dim = ctx->Attr("dim"); + + FOR_RANGE(int64_t, i, 0, index_num_axes) { + if (i != dim) { + ctx->NewBuilder() + .Split(user_op::OpArg("index", 0), i) + .Split(user_op::OpArg("input", 0), i) + .Split(user_op::OpArg("output", 0), i) + .Split(user_op::OpArg(like_or_src, 0), i) + .Build(); + } else { + ctx->NewBuilder() + .Split(user_op::OpArg("index", 0), i) + .Split(user_op::OpArg("input", 0), i) + .PartialSum(user_op::OpArg("output", 0)) + .Broadcast(user_op::OpArg(like_or_src, 0)) + .Build(); + + ctx->NewBuilder() + .Split(user_op::OpArg("index", 0), i) + .Split(user_op::OpArg("input", 0), i) + .PartialSum(user_op::OpArg("output", 0)) + .PartialSum(user_op::OpArg(like_or_src, 0)) + .Build(); + } + } + ctx->NewBuilder() .PartialSum(user_op::OpArg("input", 0)) .Broadcast(user_op::OpArg("index", 0)) @@ -113,20 +146,28 @@ Maybe SetSbpScatter(user_op::SbpContext* ctx) { return Maybe::Ok(); } -Maybe SetSbpInfer(user_op::InferSbpSignatureFnContext* ctx) { - int32_t dim = ctx->Attr("dim"); - const cfg::SbpParallel input_sbp = ctx->SbpParallelHint4InputArgNameAndIndex("input", 0); - int64_t split_axis = input_sbp.split_parallel().axis(); - if (ctx->parallel_num() != 1 && input_sbp.has_split_parallel()) { - CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; - } - return Maybe::Ok(); -} +// Maybe SetSbpInfer(user_op::InferSbpSignatureFnContext* ctx) { +// int32_t dim = ctx->Attr("dim"); +// const cfg::SbpParallel input_sbp = ctx->SbpParallelHint4InputArgNameAndIndex("input", 0); +// int64_t split_axis = input_sbp.split_parallel().axis(); +// if (ctx->parallel_num() != 1 && input_sbp.has_split_parallel()) { +// CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; +// } +// return Maybe::Ok(); +// } Maybe InferDtype(user_op::InferContext* ctx) { const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); CHECK_OR_RETURN(IsIndexDataType(index->data_type())); - *ctx->OutputDType("output", 0) = ctx->InputDType("input", 0); + const TensorDesc* input = ctx->TensorDesc4ArgNameAndIndex("input", 0); + const TensorDesc* like = ctx->TensorDesc4ArgNameAndIndex("like", 0); // can be deleted + if(input){ + CHECK_EQ_OR_RETURN(ctx->InputDType("input", 0), ctx->InputDType("src", 0)); + } + else{ + CHECK_EQ_OR_RETURN(ctx->InputDType("like", 0), ctx->InputDType("src", 0)); + } + *ctx->OutputDType("output", 0) = ctx->InputDType("src", 0); return Maybe::Ok(); } } // namespace @@ -134,28 +175,26 @@ Maybe InferDtype(user_op::InferContext* ctx) { #define REGISTER_SCATTER_LIKE_OP(optypename) \ REGISTER_USER_OP(optypename) \ .Input("like") \ - .Input("input") \ .Input("index") \ + .Input("src") \ .Output("output") \ .Attr("dim") \ .SetTensorDescInferFn(InferTensorDesc) \ .SetInputArgModifyFn(InputArgModifierFn) \ .SetDataTypeInferFn(InferDtype) \ - .SetGetSbpFn(SetSbpLike) \ - .SetSbpSignatureInferFn(SetSbpInfer) + .SetGetSbpFn(SetSbpLike) #define REGISTER_SCATTER_OP(optypename) \ REGISTER_USER_OP(optypename) \ - .OptionalInput("src") \ .Input("input") \ - .Input("index") \ + 
.Input("index") \ + .Input("src") \ .Output("output") \ .Attr("dim") \ .SetTensorDescInferFn(InferTensorDesc) \ .SetDataTypeInferFn(InferDtype) \ - .SetGetSbpFn(SetSbpScatter) \ - .SetSbpSignatureInferFn(SetSbpInfer) + .SetGetSbpFn(SetSbpScatter) #define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ @@ -181,8 +220,6 @@ REGISTER_SCATTER_LIKE_OP("dim_scatter_update_like"); REGISTER_SCATTER_OP("dim_scatter_add"); REGISTER_SCATTER_OP("dim_scatter_update"); -REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_add_like"); -REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update_like"); REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_add"); REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update"); From 472040f17333ceae69bab7fd58259b6bf101c1d5 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Wed, 7 Jul 2021 17:52:54 +0800 Subject: [PATCH 37/82] set grad op --- oneflow/user/ops/dim_scatter_ops.cpp | 38 +++++++++++++--------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index cd731f9b7e1..0cfb16c508d 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -68,32 +68,29 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = input ? input->shape() : like->shape(); - printf("infertensor ok"); + // printf("infertensor ok"); return Maybe::Ok(); } Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, const user_op::UserOpConfWrapper&) { // is there a problem? - user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); - CHECK(like_arg_modifier != nullptr); - like_arg_modifier->set_requires_grad(false); + // user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); + // CHECK(like_arg_modifier != nullptr); + // like_arg_modifier->set_requires_grad(false); - user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); - CHECK(indices_modifier != nullptr); - indices_modifier->set_requires_grad(false); - return Maybe::Ok(); -} + // user_op::InputArgModifier* src_arg_modifier = GetInputArgModifierFn("src", 0); + // CHECK(src_arg_modifier != nullptr); + // src_arg_modifier->set_requires_grad(false); -Maybe InplaceInputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, - const user_op::UserOpConfWrapper&) { - user_op::InputArgModifier* src_arg_modifier = GetInputArgModifierFn("src", 0); - CHECK(src_arg_modifier != nullptr); - src_arg_modifier->set_requires_grad(false); + user_op::InputArgModifier* input_arg_modifier = GetInputArgModifierFn("input", 0); + CHECK(input_arg_modifier != nullptr); + input_arg_modifier->set_requires_grad(false); user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); CHECK(indices_modifier != nullptr); indices_modifier->set_requires_grad(false); + return Maybe::Ok(); } @@ -193,6 +190,7 @@ Maybe InferDtype(user_op::InferContext* ctx) { .Output("output") \ .Attr("dim") \ .SetTensorDescInferFn(InferTensorDesc) \ + .SetInputArgModifyFn(InputArgModifierFn) \ .SetDataTypeInferFn(InferDtype) \ .SetGetSbpFn(SetSbpScatter) @@ -200,8 +198,8 @@ Maybe InferDtype(user_op::InferContext* ctx) { #define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ REGISTER_USER_OP_GRAD(optypename) \ .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ - const auto op_grad_name = ctx->FwOp().op_name() + "_grad"; \ - ctx->DefineOp(op_grad_name, 
[&ctx](user_op::BackwardOpBuilder& builder) { \ + const auto op_src_grad_name = ctx->FwOp().op_name() + "_grad"; \ + ctx->DefineOp(op_src_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ return builder.OpTypeName("dim_gather") \ .InputBind("index", ctx->FwOp().input("index", 0)) \ .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ @@ -209,10 +207,10 @@ Maybe InferDtype(user_op::InferContext* ctx) { .Attr("dim", ctx->FwOp().attr("dim")) \ .Build(); \ }); \ - ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ - [&ctx, &op_grad_name]() -> const std::string& { \ - return ctx->GetOp(op_grad_name).output("output", 0); \ - }); \ + ctx->FwOp().InputGradBind(user_op::OpArg("src", 0), \ + [&ctx, &op_src_grad_name]() -> const std::string& { \ + return ctx->GetOp(op_src_grad_name).output("output", 0); \ + }); }); REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); From d781ff18e4e7e278fd24f424a9e9d35b82c36fde Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Thu, 8 Jul 2021 14:08:56 +0800 Subject: [PATCH 38/82] add scatter scalar --- oneflow/user/kernels/dim_scatter_scalar.cpp | 123 ++++++++++++++++++++ oneflow/user/kernels/dim_scatter_scalar.h | 91 +++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 oneflow/user/kernels/dim_scatter_scalar.cpp create mode 100644 oneflow/user/kernels/dim_scatter_scalar.h diff --git a/oneflow/user/kernels/dim_scatter_scalar.cpp b/oneflow/user/kernels/dim_scatter_scalar.cpp new file mode 100644 index 00000000000..49effa3e5ac --- /dev/null +++ b/oneflow/user/kernels/dim_scatter_scalar.cpp @@ -0,0 +1,123 @@ +#include "oneflow/user/kernels/dim_scatter_scalar.h" + +namespace oneflow{ + +namespace user_op{ + +template \ +class CpuDimScatterScalarUpdateKernel final : public user_op::OpKernel { + public: + CpuDimScatterScalarUpdateKernel() = default; + ~CpuDimScatterScalarUpdateKernel() override = default; + + private: + void Compute(KernelComputeContext* ctx) const override { + const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); + const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); + Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); + const int32_t dim = ctx->Attr("dim"); + + const IDX_T* index = index_tensor->dptr(); + IN_T* output = out_tensor->mut_dptr(); + size_t out_bytes_size = + out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); + + Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); + const IN_T src_scalar = static_cast(ctx->Attr("src_scalar")); + + if (input_tensor) { + Memcpy(ctx->device_ctx(), output, input_tensor->dptr(), out_bytes_size); + } else if (like_tensor) { + Memset(ctx->device_ctx(), output, 0, out_bytes_size); + } else { + std::cout<<"Unimplemented Error"<shape().NumAxes(); + fixed_vector shape_vec(ndim); + auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { + std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), + [](int32_t dim) -> IDX_T { return static_cast(dim); }); + }; + shape2dims(index_tensor->shape()); + DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); + shape2dims(out_tensor->shape()); + DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); + + int64_t upper_bound = input_tensor->shape().At(dim); + + ScatterScalarUpdateFunctor(idx_nd_helper, output_nd_helper, ndim, + index_tensor->shape().elem_cnt(), dim, upper_bound, index, src_scalar, output); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + 
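
For reference, the following is a minimal standalone sketch (plain C++, not part of the patch) of the scatter-scalar update semantics the kernel above implements, specialized to a 2-D row-major array so it can be compiled and run on its own. The helper name ScatterScalarUpdate2D and the fixed 2-D layout are illustrative assumptions rather than OneFlow APIs; the real kernel handles arbitrary ndim through DimOpIndexNdHelper.

#include <cstdint>
#include <iostream>
#include <vector>

// output starts as a copy of input; for every element of index, the
// coordinate along `dim` is replaced by that index value and src_scalar is
// written there. Out-of-range indices are reported and skipped, mirroring
// the upper_bound check in the kernel.
void ScatterScalarUpdate2D(const std::vector<float>& input, int64_t rows, int64_t cols,
                           const std::vector<int64_t>& index, int64_t idx_rows,
                           int64_t idx_cols, int32_t dim, float src_scalar,
                           std::vector<float>& output) {
  output = input;  // the Memcpy(input -> output) step
  const int64_t upper_bound = (dim == 0) ? rows : cols;
  for (int64_t i = 0; i < idx_rows; ++i) {
    for (int64_t j = 0; j < idx_cols; ++j) {
      const int64_t idx_elem = index[i * idx_cols + j];
      if (idx_elem >= upper_bound) {
        std::cout << "index " << idx_elem << " is out of bounds" << std::endl;
        continue;
      }
      const int64_t row = (dim == 0) ? idx_elem : i;
      const int64_t col = (dim == 0) ? j : idx_elem;
      output[row * cols + col] = src_scalar;
    }
  }
}

int main() {
  std::vector<float> input(2 * 3, 1.f), output;
  std::vector<int64_t> index = {0, 1, 2};  // a single row of column indices
  ScatterScalarUpdate2D(input, 2, 3, index, 1, 3, /*dim=*/1, 9.f, output);
  for (float v : output) { std::cout << v << " "; }  // prints: 9 9 9 1 1 1
  std::cout << std::endl;
  return 0;
}
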
+template \ +class CpuDimScatterScalarAddKernel final : public user_op::OpKernel { + public: + CpuDimScatterScalarAddKernel() = default; + ~CpuDimScatterScalarAddKernel() override = default; + + private: + void Compute(KernelComputeContext* ctx) const override { + const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); + const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); + Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); + const int32_t dim = ctx->Attr("dim"); + + const IDX_T* index = index_tensor->dptr(); + IN_T* output = out_tensor->mut_dptr(); + size_t out_bytes_size = + out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); + + Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); + const IN_T src_scalar = static_cast(ctx->Attr("src_scalar")); + + if (input_tensor) { + Memcpy(ctx->device_ctx(), output, input_tensor->dptr(), out_bytes_size); + } else if (like_tensor) { + Memset(ctx->device_ctx(), output, 0, out_bytes_size); + } else { + std::cout<<"Unimplemented Error"<shape().NumAxes(); + fixed_vector shape_vec(ndim); + auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { + std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), + [](int32_t dim) -> IDX_T { return static_cast(dim); }); + }; + shape2dims(index_tensor->shape()); + DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); + shape2dims(out_tensor->shape()); + DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); + + int64_t upper_bound = input_tensor->shape().At(dim); + + ScatterScalarAddFunctor(idx_nd_helper, output_nd_helper, ndim, + index_tensor->shape().elem_cnt(), dim, upper_bound, index, src_scalar, output); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CPU_SCATTERSCALAR_KERNEL(device, dtype, itype) \ + REGISTER_USER_KERNEL("dim_scatter_scalar_update") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("dim_scatter_scalar_add") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); + + +REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int32_t); + +} // namespace user_op +} // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/dim_scatter_scalar.h b/oneflow/user/kernels/dim_scatter_scalar.h new file mode 100644 index 00000000000..a8a72e98ed0 --- /dev/null +++ b/oneflow/user/kernels/dim_scatter_scalar.h @@ -0,0 +1,91 @@ +#ifdef WITH_CUDA +#include "oneflow/core/cuda/atomic.cuh" +#endif // WITH_CUDA +#include "oneflow/core/ndarray/xpu_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/common/data_type.h" + +namespace oneflow{ + +namespace user_op{ + +constexpr int kDimGatherMaxDimCount = 8; + +template +using DimOpIndexNdHelper = NdIndexOffsetHelper; + +// template +// struct ScatterScalarAdd { +// OF_DEVICE_FUNC static void Add(const float x, T* y) { +// #ifdef __CUDA_ARCH__ +// cuda::atomic::Add(y, x); +// #else +// *y += x; +// #endif +// } +// }; + +// template +// struct ScatterScalarUpdate { +// OF_DEVICE_FUNC static void Update(const float x, T* y) { *y = x; } +// }; + +template 
+OF_DEVICE_FUNC void ScatterScalarUpdateFunctor(const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, const IDX_T* index, + const IN_T src, IN_T* output) { + XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { + // IDX_T coordinate[kDimGatherMaxDimCount] = {0}; + IDX_T coordinate[8] = {0}; + + idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk + IDX_T idx_elem = index[idx_offset]; + if(idx_elem>=upper_bound){ + #if __CUDA_ARCH__ + __trap(); + #else + std::cout<<"The index element "< +OF_DEVICE_FUNC void ScatterScalarAddFunctor(const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, const IDX_T* index, + const IN_T src, IN_T* output) { + XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { + // IDX_T coordinate[kDimGatherMaxDimCount] = {0}; + IDX_T coordinate[8] = {0}; + + idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk + IDX_T idx_elem = index[idx_offset]; + if(idx_elem>=upper_bound){ + #if __CUDA_ARCH__ + __trap(); + #else + std::cout<<"The index element "< Date: Thu, 8 Jul 2021 16:14:59 +0800 Subject: [PATCH 39/82] add scatter scalar gpu kernel --- oneflow/user/kernels/dim_scatter_scalar.cpp | 68 ++-------------- oneflow/user/kernels/dim_scatter_scalar.cu | 89 +++++++++++++++++++++ oneflow/user/kernels/dim_scatter_scalar.h | 56 ++----------- oneflow/user/ops/dim_scatter_ops.cpp | 86 ++++++++++++++++++-- 4 files changed, 183 insertions(+), 116 deletions(-) create mode 100644 oneflow/user/kernels/dim_scatter_scalar.cu diff --git a/oneflow/user/kernels/dim_scatter_scalar.cpp b/oneflow/user/kernels/dim_scatter_scalar.cpp index 49effa3e5ac..c81fb082147 100644 --- a/oneflow/user/kernels/dim_scatter_scalar.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar.cpp @@ -4,11 +4,11 @@ namespace oneflow{ namespace user_op{ -template \ +template class CpuDimScatterScalarUpdateKernel final : public user_op::OpKernel { public: CpuDimScatterScalarUpdateKernel() = default; - ~CpuDimScatterScalarUpdateKernel() override = default; + ~CpuDimScatterScalarUpdateKernel() = default; private: void Compute(KernelComputeContext* ctx) const override { @@ -54,70 +54,18 @@ class CpuDimScatterScalarUpdateKernel final : public user_op::OpKernel { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -template \ -class CpuDimScatterScalarAddKernel final : public user_op::OpKernel { - public: - CpuDimScatterScalarAddKernel() = default; - ~CpuDimScatterScalarAddKernel() override = default; - - private: - void Compute(KernelComputeContext* ctx) const override { - const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); - const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); - Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); - const int32_t dim = ctx->Attr("dim"); - - const IDX_T* index = index_tensor->dptr(); - IN_T* output = out_tensor->mut_dptr(); - size_t out_bytes_size = - out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); - - Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); - const IN_T src_scalar = static_cast(ctx->Attr("src_scalar")); - - if (input_tensor) { - Memcpy(ctx->device_ctx(), output, input_tensor->dptr(), out_bytes_size); - } else if (like_tensor) { - Memset(ctx->device_ctx(), output, 0, out_bytes_size); - } else { - 
std::cout<<"Unimplemented Error"<shape().NumAxes(); - fixed_vector shape_vec(ndim); - auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { - std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), - [](int32_t dim) -> IDX_T { return static_cast(dim); }); - }; - shape2dims(index_tensor->shape()); - DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); - shape2dims(out_tensor->shape()); - DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); - - int64_t upper_bound = input_tensor->shape().At(dim); - - ScatterScalarAddFunctor(idx_nd_helper, output_nd_helper, ndim, - index_tensor->shape().elem_cnt(), dim, upper_bound, index, src_scalar, output); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - #define REGISTER_CPU_SCATTERSCALAR_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_scalar_update") \ .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("dim_scatter_scalar_add") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); - + & (user_op::HobDataType("index", 0) == GetDataType::value)); REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int32_t); +REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int64_t); +REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, double, int32_t); +REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, double, int64_t); + } // namespace user_op -} // namespace oneflow \ No newline at end of file +} // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_scalar.cu b/oneflow/user/kernels/dim_scatter_scalar.cu new file mode 100644 index 00000000000..6d02fe4d3f7 --- /dev/null +++ b/oneflow/user/kernels/dim_scatter_scalar.cu @@ -0,0 +1,89 @@ +#ifdef WITH_CUDA +#include "oneflow/user/kernels/dim_scatter_scalar.h" + +namespace oneflow{ + +namespace user_op{ + +namespace{ + +template +__global__ void DoCUDADimScatterScalarUpdate(const DimOpIndexNdHelper idx_nd_helper, + const DimOpIndexNdHelper output_nd_helper, + const int ndim, const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const IN_T src_scalar, IN_T* output) { + ScatterScalarUpdateFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, src_scalar, output); + } +} // namespace + +template +class GpuDimScatterScalarUpdateKernel final : public OpKernel { + public: + GpuDimScatterScalarUpdateKernel() = default; + ~GpuDimScatterScalarUpdateKernel() = default; + + private: + void Compute(KernelComputeContext* ctx) const override { + const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); + const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); + Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); + const int32_t dim = ctx->Attr("dim"); + + const IDX_T* index = index_tensor->dptr(); + IN_T* output = out_tensor->mut_dptr(); + size_t out_bytes_size = + out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); + + Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); + const IN_T src_scalar = static_cast(ctx->Attr("src_scalar")); + + if (input_tensor) { + Memcpy(ctx->device_ctx(), output, input_tensor->dptr(), out_bytes_size); + } else if 
(like_tensor) { + Memset(ctx->device_ctx(), output, 0, out_bytes_size); + } else { + std::cout<<"Unimplemented Error"<shape().NumAxes(); + fixed_vector shape_vec(ndim); + auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { + std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), + [](int32_t dim) -> IDX_T { return static_cast(dim); }); + }; + shape2dims(index_tensor->shape()); + DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); + shape2dims(out_tensor->shape()); + DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); + + int64_t upper_bound = input_tensor->shape().At(dim); + int64_t elem_cnt = index_tensor->shape().elem_cnt(); + + RUN_CUDA_KERNEL((DoCUDADimScatterScalarUpdate), ctx->device_ctx(), BlocksNum4ThreadsNum(elem_cnt), + idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, + src_scalar, output); + + // RUN_CUDA_KERNEL((ScatterScalarUpdateFunctor), ctx, BlocksNum4ThreadsNum(elem_cnt), + // idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, + // src_scalar, output); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_GPU_SCATTERSCALAR_KERNEL(device, dtype, itype) \ + REGISTER_USER_KERNEL("dim_scatter_scalar_update") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); + +REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int32_t); +REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int64_t); +REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int32_t); +REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int64_t); + + +} // namespace user_op +} // namespace oneflow +#endif diff --git a/oneflow/user/kernels/dim_scatter_scalar.h b/oneflow/user/kernels/dim_scatter_scalar.h index a8a72e98ed0..e990e417d2a 100644 --- a/oneflow/user/kernels/dim_scatter_scalar.h +++ b/oneflow/user/kernels/dim_scatter_scalar.h @@ -1,6 +1,6 @@ -#ifdef WITH_CUDA -#include "oneflow/core/cuda/atomic.cuh" -#endif // WITH_CUDA +#ifndef ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_H_ +#define ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_H_ +#include "oneflow/core/device/device_context.h" #include "oneflow/core/ndarray/xpu_util.h" #include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/framework/framework.h" @@ -15,30 +15,13 @@ constexpr int kDimGatherMaxDimCount = 8; template using DimOpIndexNdHelper = NdIndexOffsetHelper; -// template -// struct ScatterScalarAdd { -// OF_DEVICE_FUNC static void Add(const float x, T* y) { -// #ifdef __CUDA_ARCH__ -// cuda::atomic::Add(y, x); -// #else -// *y += x; -// #endif -// } -// }; - -// template -// struct ScatterScalarUpdate { -// OF_DEVICE_FUNC static void Update(const float x, T* y) { *y = x; } -// }; - template OF_DEVICE_FUNC void ScatterScalarUpdateFunctor(const DimOpIndexNdHelper& idx_nd_helper, const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T src, IN_T* output) { XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { - // IDX_T coordinate[kDimGatherMaxDimCount] = {0}; - IDX_T coordinate[8] = {0}; + IDX_T coordinate[kDimGatherMaxDimCount] = {0}; idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk IDX_T idx_elem = index[idx_offset]; @@ -56,36 +39,7 @@ OF_DEVICE_FUNC void 
ScatterScalarUpdateFunctor(const DimOpIndexNdHelper& } } - -template -OF_DEVICE_FUNC void ScatterScalarAddFunctor(const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, const IDX_T* index, - const IN_T src, IN_T* output) { - XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { - // IDX_T coordinate[kDimGatherMaxDimCount] = {0}; - IDX_T coordinate[8] = {0}; - - idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk - IDX_T idx_elem = index[idx_offset]; - if(idx_elem>=upper_bound){ - #if __CUDA_ARCH__ - __trap(); - #else - std::cout<<"The index element "< InferTensorDesc(user_op::InferContext* ctx) { } else if (like) { output_num_axes = like->shape().NumAxes(); } else { - Error::Unimplemented(); + throw Error::Unimplemented(); } CHECK_EQ_OR_RETURN(output_num_axes, index_num_axes); @@ -54,17 +54,17 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { if(i==dim) continue; if(input){ CHECK_LE_OR_RETURN(index->shape().At(i), input->shape().At(i)); - } + } else{ CHECK_LE_OR_RETURN(index->shape().At(i), like->shape().At(i)); + } } - } // check index.shape(i) <= src.shape(i) FOR_RANGE(int64_t, i, 0, index_num_axes) { if(i==dim) continue; CHECK_LE_OR_RETURN(index->shape().At(i), src->shape().At(i)); - } + } user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = input ? input->shape() : like->shape(); @@ -72,6 +72,30 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { return Maybe::Ok(); } +Maybe InferScalarTensorDesc(user_op::InferContext* ctx) { + const TensorDesc* input = ctx->TensorDesc4ArgNameAndIndex("input", 0); + const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); + + float src_scalar = ctx->Attr("src_scalar"); + int32_t dim = ctx->Attr("dim"); + + // check index.numaxes == src.num_axes == input/like.numaxes + int64_t output_num_axes = input->shape().NumAxes(); + int64_t index_num_axes = index->shape().NumAxes(); + CHECK_EQ_OR_RETURN(output_num_axes, index_num_axes); + + // check index.shape(i) <= input/like.shape(i) + FOR_RANGE(int64_t, i, 0, index_num_axes) { + if(i==dim) continue; + CHECK_LE_OR_RETURN(index->shape().At(i), input->shape().At(i)); + } + + user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); + *out->mut_shape() = input->shape(); + printf("infer scalar tensor ok"); + return Maybe::Ok(); +} + Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, const user_op::UserOpConfWrapper&) { // is there a problem? 
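
Taken together, the shape checks added in InferTensorDesc and InferScalarTensorDesc above reduce to a simple predicate: index must have the same number of axes as the other operands, and on every axis except dim its extent may not exceed that of either the destination (input/like) or src. Below is a standalone sketch of that rule; IndexShapeIsValid is an illustrative helper name, not an OneFlow function.

#include <cstdint>
#include <vector>

// True when an index shape is legal for dim_scatter given the destination
// (input/like) shape and the src shape: same rank everywhere, and on every
// axis other than `dim` the index extent may not exceed either operand.
bool IndexShapeIsValid(const std::vector<int64_t>& index_shape,
                       const std::vector<int64_t>& dst_shape,
                       const std::vector<int64_t>& src_shape, int32_t dim) {
  if (index_shape.size() != dst_shape.size()) { return false; }
  if (index_shape.size() != src_shape.size()) { return false; }
  for (size_t i = 0; i < index_shape.size(); ++i) {
    if (static_cast<int32_t>(i) == dim) { continue; }
    if (index_shape[i] > dst_shape[i]) { return false; }
    if (index_shape[i] > src_shape[i]) { return false; }
  }
  return true;
}

int main() {
  // A 2x3 index scattered from a 2x3 src into a 4x3 destination along dim 0: accepted.
  const bool ok = IndexShapeIsValid({2, 3}, {4, 3}, {2, 3}, /*dim=*/0);
  // A 2x5 index exceeds the destination's second axis: rejected.
  const bool bad = IndexShapeIsValid({2, 5}, {4, 3}, {2, 3}, /*dim=*/0);
  return (ok && !bad) ? 0 : 1;
}
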
@@ -83,6 +107,7 @@ Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierF // CHECK(src_arg_modifier != nullptr); // src_arg_modifier->set_requires_grad(false); + // should be deleted user_op::InputArgModifier* input_arg_modifier = GetInputArgModifierFn("input", 0); CHECK(input_arg_modifier != nullptr); input_arg_modifier->set_requires_grad(false); @@ -94,6 +119,15 @@ Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierF return Maybe::Ok(); } +Maybe InputScalarArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, + const user_op::UserOpConfWrapper&) { + user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); + CHECK(indices_modifier != nullptr); + indices_modifier->set_requires_grad(false); + + return Maybe::Ok(); +} + void _SetSbp(user_op::SbpContext* ctx, const char* like_or_src) { const user_op::TensorDesc& index_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); @@ -167,6 +201,14 @@ Maybe InferDtype(user_op::InferContext* ctx) { *ctx->OutputDType("output", 0) = ctx->InputDType("src", 0); return Maybe::Ok(); } + +Maybe InferScalarDtype(user_op::InferContext* ctx) { + const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); + CHECK_OR_RETURN(IsIndexDataType(index->data_type())); + *ctx->OutputDType("output", 0) = ctx->InputDType("input", 0); + return Maybe::Ok(); +} + } // namespace #define REGISTER_SCATTER_LIKE_OP(optypename) \ @@ -194,6 +236,17 @@ Maybe InferDtype(user_op::InferContext* ctx) { .SetDataTypeInferFn(InferDtype) \ .SetGetSbpFn(SetSbpScatter) +#define REGISTER_SCATTER_SCALAR_OP(optypename) \ + REGISTER_USER_OP(optypename) \ + .Input("input") \ + .Input("index") \ + .Attr("src_scalar") \ + .Output("output") \ + .Attr("dim") \ + .SetTensorDescInferFn(InferScalarTensorDesc) \ + .SetInputArgModifyFn(InputScalarArgModifierFn) \ + .SetDataTypeInferFn(InferScalarDtype) \ + .SetGetSbpFn(SetSbpScatter) #define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ REGISTER_USER_OP_GRAD(optypename) \ @@ -210,7 +263,26 @@ Maybe InferDtype(user_op::InferContext* ctx) { ctx->FwOp().InputGradBind(user_op::OpArg("src", 0), \ [&ctx, &op_src_grad_name]() -> const std::string& { \ return ctx->GetOp(op_src_grad_name).output("output", 0); \ - }); + }); \ + }); + +#define REGISTER_USER_OP_GRAD_SCATTER_SCALAR(optypename) \ + REGISTER_USER_OP_GRAD(optypename) \ + .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ + const auto op_input_grad_name = ctx->FwOp().op_name() + "_grad"; \ + ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ + return builder.OpTypeName("dim_scatter_scalar_update") \ + .InputBind("index", ctx->FwOp().input("index", 0)) \ + .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ + .Output("output") \ + .Attr("dim", ctx->FwOp().attr("dim")) \ + .Attr("src_scalar", static_cast(0.0)) \ + .Build(); \ + }); \ + ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ + [&ctx, &op_input_grad_name]() -> const std::string& { \ + return ctx->GetOp(op_input_grad_name).output("output", 0); \ + }); \ }); REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); @@ -218,8 +290,12 @@ REGISTER_SCATTER_LIKE_OP("dim_scatter_update_like"); REGISTER_SCATTER_OP("dim_scatter_add"); REGISTER_SCATTER_OP("dim_scatter_update"); +REGISTER_SCATTER_SCALAR_OP("dim_scatter_scalar_update"); +REGISTER_SCATTER_SCALAR_OP("dim_scatter_scalar_add"); + REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_add"); REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update"); 
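
The gradient wiring registered in this file and refined by the following "torch style backprop" commits amounts to, for the update variant: the src gradient gathers output_grad at the scattered positions (dim_gather), and the input gradient is output_grad with zeros written at those positions (dim_scatter_scalar_update with src_scalar = 0). A standalone 1-D sketch of that rule follows; SrcGrad1D and InputGrad1D are illustrative names only, and the later commits additionally gate this wiring on the shapes of index and src.

#include <cstdint>
#include <vector>

// src gradient: gather output_grad at the scattered positions
// (what the dim_gather backward op produces).
std::vector<float> SrcGrad1D(const std::vector<float>& output_grad,
                             const std::vector<int64_t>& index) {
  std::vector<float> src_grad(index.size());
  for (size_t i = 0; i < index.size(); ++i) { src_grad[i] = output_grad[index[i]]; }
  return src_grad;
}

// input gradient for the update variant: output_grad with zeros written at
// the scattered positions (dim_scatter_scalar_update with src_scalar = 0).
std::vector<float> InputGrad1D(const std::vector<float>& output_grad,
                               const std::vector<int64_t>& index) {
  std::vector<float> input_grad = output_grad;
  for (const int64_t idx : index) { input_grad[idx] = 0.f; }
  return input_grad;
}

int main() {
  const std::vector<float> output_grad = {0.1f, 0.2f, 0.3f, 0.4f};
  const std::vector<int64_t> index = {1, 3};
  const auto src_grad = SrcGrad1D(output_grad, index);      // {0.2, 0.4}
  const auto input_grad = InputGrad1D(output_grad, index);  // {0.1, 0.0, 0.3, 0.0}
  return (src_grad[1] == 0.4f && input_grad[1] == 0.f) ? 0 : 1;
}
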
+REGISTER_USER_OP_GRAD_SCATTER_SCALAR("dim_scatter_scalar_update"); } // namespace user_op } // namespace oneflow \ No newline at end of file From 3610e45cfaf8b477e4c45a5c1215e21df698be30 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Thu, 8 Jul 2021 16:59:38 +0800 Subject: [PATCH 40/82] add torch style backprop --- oneflow/user/ops/dim_scatter_ops.cpp | 35 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 3c9ab689556..5715e47a8fb 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -76,7 +76,6 @@ Maybe InferScalarTensorDesc(user_op::InferContext* ctx) { const TensorDesc* input = ctx->TensorDesc4ArgNameAndIndex("input", 0); const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); - float src_scalar = ctx->Attr("src_scalar"); int32_t dim = ctx->Attr("dim"); // check index.numaxes == src.num_axes == input/like.numaxes @@ -98,19 +97,6 @@ Maybe InferScalarTensorDesc(user_op::InferContext* ctx) { Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, const user_op::UserOpConfWrapper&) { - // is there a problem? - // user_op::InputArgModifier* like_arg_modifier = GetInputArgModifierFn("like", 0); - // CHECK(like_arg_modifier != nullptr); - // like_arg_modifier->set_requires_grad(false); - - // user_op::InputArgModifier* src_arg_modifier = GetInputArgModifierFn("src", 0); - // CHECK(src_arg_modifier != nullptr); - // src_arg_modifier->set_requires_grad(false); - - // should be deleted - user_op::InputArgModifier* input_arg_modifier = GetInputArgModifierFn("input", 0); - CHECK(input_arg_modifier != nullptr); - input_arg_modifier->set_requires_grad(false); user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); CHECK(indices_modifier != nullptr); @@ -191,7 +177,6 @@ Maybe InferDtype(user_op::InferContext* ctx) { const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); CHECK_OR_RETURN(IsIndexDataType(index->data_type())); const TensorDesc* input = ctx->TensorDesc4ArgNameAndIndex("input", 0); - const TensorDesc* like = ctx->TensorDesc4ArgNameAndIndex("like", 0); // can be deleted if(input){ CHECK_EQ_OR_RETURN(ctx->InputDType("input", 0), ctx->InputDType("src", 0)); } @@ -250,8 +235,8 @@ Maybe InferScalarDtype(user_op::InferContext* ctx) { #define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ REGISTER_USER_OP_GRAD(optypename) \ - .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ - const auto op_src_grad_name = ctx->FwOp().op_name() + "_grad"; \ + .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ + const auto op_src_grad_name = ctx->FwOp().op_name() + "_src_grad"; \ ctx->DefineOp(op_src_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ return builder.OpTypeName("dim_gather") \ .InputBind("index", ctx->FwOp().input("index", 0)) \ @@ -264,12 +249,26 @@ Maybe InferScalarDtype(user_op::InferContext* ctx) { [&ctx, &op_src_grad_name]() -> const std::string& { \ return ctx->GetOp(op_src_grad_name).output("output", 0); \ }); \ + const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; \ + ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ + return builder.OpTypeName("dim_scatter_scalar_update") \ + .InputBind("index", ctx->FwOp().input("index", 0)) \ + .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ + .Output("output") \ + .Attr("dim", 
ctx->FwOp().attr("dim")) \ + .Attr("src_scalar", static_cast(0.0)) \ + .Build(); \ + }); \ + ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ + [&ctx, &op_input_grad_name]() -> const std::string& { \ + return ctx->GetOp(op_input_grad_name).output("output", 0); \ + }); \ }); #define REGISTER_USER_OP_GRAD_SCATTER_SCALAR(optypename) \ REGISTER_USER_OP_GRAD(optypename) \ .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ - const auto op_input_grad_name = ctx->FwOp().op_name() + "_grad"; \ + const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; \ ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ return builder.OpTypeName("dim_scatter_scalar_update") \ .InputBind("index", ctx->FwOp().input("index", 0)) \ From 22e16e7aaa350da39de58680923c5e76943ff0b5 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Thu, 8 Jul 2021 19:58:41 +0800 Subject: [PATCH 41/82] add torch style backprop check --- oneflow/user/ops/dim_scatter_ops.cpp | 74 +++++++++++++++++----------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 5715e47a8fb..738806e0350 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -89,9 +89,8 @@ Maybe InferScalarTensorDesc(user_op::InferContext* ctx) { CHECK_LE_OR_RETURN(index->shape().At(i), input->shape().At(i)); } - user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); + TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = input->shape(); - printf("infer scalar tensor ok"); return Maybe::Ok(); } @@ -236,35 +235,52 @@ Maybe InferScalarDtype(user_op::InferContext* ctx) { #define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ REGISTER_USER_OP_GRAD(optypename) \ .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ - const auto op_src_grad_name = ctx->FwOp().op_name() + "_src_grad"; \ - ctx->DefineOp(op_src_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ - return builder.OpTypeName("dim_gather") \ - .InputBind("index", ctx->FwOp().input("index", 0)) \ - .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ - .Output("output") \ - .Attr("dim", ctx->FwOp().attr("dim")) \ - .Build(); \ - }); \ - ctx->FwOp().InputGradBind(user_op::OpArg("src", 0), \ - [&ctx, &op_src_grad_name]() -> const std::string& { \ - return ctx->GetOp(op_src_grad_name).output("output", 0); \ - }); \ - const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; \ - ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ - return builder.OpTypeName("dim_scatter_scalar_update") \ - .InputBind("index", ctx->FwOp().input("index", 0)) \ - .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ - .Output("output") \ - .Attr("dim", ctx->FwOp().attr("dim")) \ - .Attr("src_scalar", static_cast(0.0)) \ - .Build(); \ - }); \ - ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ - [&ctx, &op_input_grad_name]() -> const std::string& { \ - return ctx->GetOp(op_input_grad_name).output("output", 0); \ - }); \ + const TensorDesc& src = ctx->FwOp().TensorDesc4ArgNameAndIndex("src", 0); \ + const TensorDesc& index = ctx->FwOp().TensorDesc4ArgNameAndIndex("index", 0); \ + const int64_t ndim = src.shape().NumAxes(); \ + bool backprop_flag = true; \ + FOR_RANGE(int64_t, i, 0, ndim) { \ + if(index.shape().At(i)!=src.shape().At(i)){ \ + backprop_flag = false; \ + break; \ + } \ + }\ + if(backprop_flag){ \ + 
const auto op_src_grad_name = ctx->FwOp().op_name() + "_src_grad"; \ + ctx->DefineOp(op_src_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ + return builder.OpTypeName("dim_gather") \ + .InputBind("index", ctx->FwOp().input("index", 0)) \ + .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ + .Output("output") \ + .Attr("dim", ctx->FwOp().attr("dim")) \ + .Build(); \ + }); \ + ctx->FwOp().InputGradBind(user_op::OpArg("src", 0), \ + [&ctx, &op_src_grad_name]() -> const std::string& { \ + return ctx->GetOp(op_src_grad_name).output("output", 0); \ + }); \ + const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; \ + ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ + return builder.OpTypeName("dim_scatter_scalar_update") \ + .InputBind("index", ctx->FwOp().input("index", 0)) \ + .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ + .Output("output") \ + .Attr("dim", ctx->FwOp().attr("dim")) \ + .Attr("src_scalar", static_cast(0.0)) \ + .Build(); \ + }); \ + ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ + [&ctx, &op_input_grad_name]() -> const std::string& { \ + return ctx->GetOp(op_input_grad_name).output("output", 0); \ + }); \ + } \ + else{ \ + std::cout<<"The backward pass is implemented only for src.shape == index.shape."< Date: Thu, 8 Jul 2021 20:13:10 +0800 Subject: [PATCH 42/82] align with master --- oneflow/user/ops/dim_gather_op.cpp | 70 +++++++++++++----------------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index a10f38dfdc3..6128f5b8cfb 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp @@ -19,8 +19,13 @@ limitations under the License. 
namespace oneflow { namespace user_op { -namespace { -Maybe InferTensorDesc(user_op::InferContext* ctx) { + +REGISTER_USER_OP("dim_gather") +.Input("input") +.Input("index") +.Output("output") +.Attr("dim") +.SetTensorDescInferFn([](user_op::InferContext* ctx) -> Maybe { const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); int64_t input_num_axes = in->shape().NumAxes(); CHECK_GT_OR_RETURN(input_num_axes, 0); @@ -28,33 +33,41 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); int64_t index_num_axes = index->shape().NumAxes(); - CHECK_OR_RETURN(IsIndexDataType(index->data_type())); const int32_t dim = ctx->Attr("dim"); CHECK_GE_OR_RETURN(dim, 0); CHECK_LT_OR_RETURN(dim, input_num_axes); CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); - CHECK_OR_RETURN(!in->is_dynamic()); - CHECK_OR_RETURN(!index->is_dynamic()); + CHECK_EQ_OR_RETURN(in->is_dynamic(), index->is_dynamic()); - user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); + FOR_RANGE(int64_t, i, 0, input_num_axes) { + if (i == dim) { continue; } + CHECK_EQ_OR_RETURN(in->shape().At(i), index->shape().At(i)); + } + + user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); *out->mut_shape() = index->shape(); - *out->mut_data_type() = in->data_type(); return Maybe::Ok(); -} - -void GatherInputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, - const user_op::UserOpConfWrapper&) { +}) +.SetDataTypeInferFn([](user_op::InferContext* ctx) -> Maybe { + const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); + CHECK_OR_RETURN(IsIndexDataType(index->data_type())); + const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); + user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); + *out->mut_data_type() = in->data_type(); + return Maybe::Ok(); +}) +.SetInputArgModifyFn([](user_op::GetInputArgModifier GetInputArgModifierFn, + const user_op::UserOpConfWrapper&) { user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); CHECK(indices_modifier != nullptr); indices_modifier->set_requires_grad(false); -} - - -Maybe BuildSbp(user_op::SbpContext* ctx) { - const user_op::TensorDesc& index_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); +}) +.SetGetSbpFn([](user_op::SbpContext* ctx) -> Maybe { + const user_op::TensorDesc& index_tensor = + ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); int64_t index_num_axes = index_tensor.shape().NumAxes(); const int32_t dim = ctx->Attr("dim"); @@ -73,35 +86,14 @@ Maybe BuildSbp(user_op::SbpContext* ctx) { .Build(); } } - ctx->NewBuilder() .PartialSum(user_op::OpArg("input", 0)) .Broadcast(user_op::OpArg("index", 0)) .PartialSum(user_op::OpArg("output", 0)) .Build(); return Maybe::Ok(); -} - -Maybe InferDtype(user_op::InferContext* ctx) { - const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); - CHECK_OR_RETURN(IsIndexDataType(index->data_type())); - const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); - user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); - *out->mut_data_type() = in->data_type(); - return Maybe::Ok(); -} - -} // namespace +}); -REGISTER_USER_OP("dim_gather") - .Input("input") - .Input("index") - .Output("output") - .Attr("dim") - .SetTensorDescInferFn(InferTensorDesc) - .SetInputArgModifyFn(GatherInputArgModifierFn) - .SetDataTypeInferFn(InferDtype) - .SetGetSbpFn(BuildSbp); 
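[Note: the SBP signatures in the registration above lean on the fact that gathering along dim commutes with splitting any other axis, so input, index and output can all be split on the same axis i != dim. A quick NumPy check of that property, illustrative only; np_dim_gather is a throwaway reference helper, not an API added by this patch.]

import numpy as np

def np_dim_gather(input_arr, dim, index):
    # Reference dim_gather: out[c] = input_arr[c with c[dim] := index[c]].
    out = np.empty(index.shape, dtype=input_arr.dtype)
    for flat in range(index.size):
        coord = list(np.unravel_index(flat, index.shape))
        in_coord = list(coord)
        in_coord[dim] = int(index[tuple(coord)])
        out[tuple(coord)] = input_arr[tuple(in_coord)]
    return out

x = np.arange(12).reshape(3, 4)
idx = np.random.randint(0, 4, size=(3, 4))
whole = np_dim_gather(x, 1, idx)
halves = np.concatenate(
    [np_dim_gather(x[:2], 1, idx[:2]), np_dim_gather(x[2:], 1, idx[2:])], axis=0
)
assert np.array_equal(whole, halves)  # splitting axis 0 (!= dim) gives the same result
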
REGISTER_USER_OP_GRAD("dim_gather").SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { const auto op_grad_name = ctx->FwOp().op_name() + "_grad"; @@ -111,7 +103,7 @@ REGISTER_USER_OP_GRAD("dim_gather").SetBackwardOpConfGenFn([](user_op::BackwardO .OpTypeName( "dim_scatter_add_like") // dim_scatter_add_like(like, dim, index, input) -> output .InputBind("index", ctx->FwOp().input("index", 0)) // scatter.index <- gather.index - .InputBind("input", + .InputBind("src", ctx->FwOp().output_grad("output", 0)) // scatter.input <- grad of gather.out .InputBind("like", ctx->FwOp().input("input", 0)) .Output("output") From f75a5bad88451f5a3c9b9ca6a50ba85d87187b0a Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Thu, 8 Jul 2021 20:40:49 +0800 Subject: [PATCH 43/82] remove redundant sbp check --- oneflow/user/ops/dim_scatter_ops.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 738806e0350..f1392ce8fae 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -162,16 +162,6 @@ Maybe SetSbpScatter(user_op::SbpContext* ctx) { return Maybe::Ok(); } -// Maybe SetSbpInfer(user_op::InferSbpSignatureFnContext* ctx) { -// int32_t dim = ctx->Attr("dim"); -// const cfg::SbpParallel input_sbp = ctx->SbpParallelHint4InputArgNameAndIndex("input", 0); -// int64_t split_axis = input_sbp.split_parallel().axis(); -// if (ctx->parallel_num() != 1 && input_sbp.has_split_parallel()) { -// CHECK_NE_OR_RETURN(split_axis, dim) << "split_axis should NOT equal dim"; -// } -// return Maybe::Ok(); -// } - Maybe InferDtype(user_op::InferContext* ctx) { const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); CHECK_OR_RETURN(IsIndexDataType(index->data_type())); From 4a10851ad7cf11d1c028e3770dc51d186142f696 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 12 Jul 2021 10:44:02 +0800 Subject: [PATCH 44/82] add test --- .../python/test/ops/test_dim_scatter_ops.py | 303 +++++++++++------- 1 file changed, 193 insertions(+), 110 deletions(-) diff --git a/oneflow/python/test/ops/test_dim_scatter_ops.py b/oneflow/python/test/ops/test_dim_scatter_ops.py index 4facfcd9440..4f9218f713d 100644 --- a/oneflow/python/test/ops/test_dim_scatter_ops.py +++ b/oneflow/python/test/ops/test_dim_scatter_ops.py @@ -15,11 +15,12 @@ """ import oneflow as flow import numpy as np -import oneflow.typing as oft +import oneflow.typing as tp from test_util import GenArgList import unittest from collections import OrderedDict import os +import random flow.config.enable_debug_mode(True) @@ -34,7 +35,7 @@ def _bin_update(out_val, in_value): def gen_scatter_like_test_sample( - input_shape, + src_shape, index_shape, dim, like_shape, @@ -42,29 +43,32 @@ def gen_scatter_like_test_sample( binop=_bin_add, inplace=True, ): - def _np_dim_scatter_add_like(input, dim, index, like): + def _np_dim_scatter_add_like(like, dim, index, src): + out_shape = like.shape + flatten_idx = index.flatten() + if inplace: output = like.copy() else: - output = np.zeros(like.shape) - - for inputidx in range(0, input.size): - outcoord = np.unravel_index(inputidx, input.shape) - outcoord = [*outcoord] - outcoord[dim] = index[np.unravel_index(inputidx, index.shape)] - output_offset = np.ravel_multi_index(outcoord, like_shape) - output[np.unravel_index(output_offset, like_shape)] = binop( - output[np.unravel_index(output_offset, like_shape)], - input[np.unravel_index(inputidx, input.shape)], - ) + output 
= np.zeros(out_shape) + for idx in range(0, index.size): + idx_coord = list(np.unravel_index(idx, index.shape)) + idx_elem = flatten_idx[idx] + src_offset = np.ravel_multi_index(idx_coord, src.shape) + idx_coord[dim] = idx_elem + output_offset = np.ravel_multi_index(idx_coord, out_shape) + output[np.unravel_index(output_offset, out_shape)] = binop( + output[np.unravel_index(output_offset, out_shape)], + src[np.unravel_index(src_offset, src.shape)], + ) return output if is_float: - input = np.random.random(input_shape) + src = np.random.random(src_shape) like = np.random.random(like_shape) else: - input = np.random.randint(0, 100, input_shape) + src = np.random.randint(0, 100, src_shape) like = np.random.randint(0, 100, like_shape) def _np_dim_gather(dim, input, index): @@ -77,12 +81,24 @@ def _np_dim_gather(dim, input, index): output[tuple(outcoord)] = input[tuple(incoord)] return output - index = np.random.randint(0, like_shape[dim], index_shape) + shape_elemcnt = 1 + index_shape_list = list(index_shape) + for i in range(len(index_shape_list)): + shape_elemcnt *= index_shape_list[i] + + index_total = [] + for i in range(int(shape_elemcnt/like_shape[dim])): + index_arr = np.arange(0, like_shape[dim]) + random.shuffle(index_arr) + index_total.append(index_arr) + + index = np.stack(index_total) + + output = _np_dim_scatter_add_like(like, dim, index, src) - output = _np_dim_scatter_add_like(input, dim, index, like) grad = _np_dim_gather(dim, np.ones(output.shape), index) return { - "input": input, + "src": src, "index": index, "like": like, "dim": dim, @@ -92,6 +108,7 @@ def _np_dim_gather(dim, input, index): def _gen_arg_dict( + grad_flag=False, device_type="gpu", value_type="float", machine_ids="0:0", @@ -101,25 +118,29 @@ def _gen_arg_dict( inplace=True, ): arg_dict = OrderedDict() + arg_dict["grad_flag"] = [grad_flag] arg_dict["device_type"] = [device_type] arg_dict["samples"] = [] arg_dict["samples"].append( gen_scatter_like_test_sample( - (2, 2), - (2, 2), + (2, 3), + (2, 3), 1, - (2, 2), + (2, 3), is_float=value_type == "float", binop=binop, inplace=inplace, ) ) - # arg_dict["samples"].append(gen_scatter_like_test_sample((2, 2), (2, 2), 0, (4, 4), value_type=="float")) - # arg_dict["samples"].append(gen_scatter_like_test_sample((4, 3, 3), (4, 3, 3), 0, (5, 5, 5), value_type=="float")) if value_type == "float": - arg_dict["value_type"] = [ - (np.float32, flow.float32), - ] + if device_type == "cpu": + arg_dict["value_type"] = [ + (np.float32, flow.float32), + ] + else: + arg_dict["value_type"] = [ + (np.float32, flow.float32), + ] elif value_type == "int": arg_dict["value_type"] = [(np.float32, flow.int32)] else: @@ -134,7 +155,8 @@ def _gen_arg_dict( def _make_dim_scatter_add_like_fn( test_case, - input, + grad_flag, + src, index, dim, like, @@ -163,75 +185,120 @@ def _make_dim_scatter_add_like_fn( func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids)) func_config.default_logical_view(flow.scope.consistent_view()) - def _compare_diff(blob: oft.Numpy): + def _compare_diff(blob: tp.Numpy): test_case.assertTrue(np.allclose(grad, blob)) - if value_type == flow.float32 or value_type == flow.float64: - - @flow.global_function(type="train", function_config=func_config) - def scatter_add_like_fn( - params_def: oft.Numpy.Placeholder(input.shape, dtype=value_type), - indices_def: oft.Numpy.Placeholder(index.shape, dtype=index_type), - like_def: oft.Numpy.Placeholder(like.shape, dtype=value_type), - ) -> oft.Numpy: - with flow.scope.placement(device_type, "0:0"): - 
x_var = flow.get_variable( - "input", - shape=input.shape, - dtype=value_type, - initializer=flow.constant_initializer(0), - ) - x_var = flow.cast_to_current_logical_view(x_var) - x = x_var + params_def - - y = flow_scatter_op(x, dim, indices_def, like_def) - - with flow.scope.placement(device_type, "0:0"): - flow.optimizer.SGD( - flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0 - ).minimize(y) - - flow.watch_diff(x, _compare_diff) - return y + if grad_flag: + if value_type == flow.float32 or value_type == flow.float64: + @flow.global_function(type="train", function_config=func_config) + def scatter_add_like_fn( + like_def: tp.Numpy.Placeholder(like.shape, dtype=value_type), + indices_def: tp.Numpy.Placeholder(index.shape, dtype=index_type), + src_def: tp.Numpy.Placeholder(src.shape, dtype=value_type), + ) -> tp.Numpy: + with flow.scope.placement(device_type, "0:0"): + src_var = flow.get_variable( + "src", + shape=src.shape, + dtype=value_type, + initializer=flow.constant_initializer(0), + ) + src_var = flow.cast_to_current_logical_view(src_var) + src_tensor = src_var + src_def + + y = flow_scatter_op(like_def, dim, indices_def, src_tensor) + + with flow.scope.placement(device_type, "0:0"): + flow.optimizer.SGD( + flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0 + ).minimize(y) + + flow.watch_diff(src_var, _compare_diff) + return y + + if value_type == flow.int32: + @flow.global_function(type="train", function_config=func_config) + def scatter_add_like_fn( + like_def: tp.Numpy.Placeholder(like.shape, dtype=flow.float32), + indices_def: tp.Numpy.Placeholder(index.shape, dtype=index_type), + src_def: tp.Numpy.Placeholder(src.shape, dtype=flow.float32), + ) -> tp.Numpy: + with flow.scope.placement(device_type, "0:0"): + src_var = flow.get_variable( + "src", + shape=src_def.shape, + dtype=flow.float32, + initializer=flow.constant_initializer(0), + ) + src_var = flow.cast_to_current_logical_view(src_var) + src_tensor = src_var + src_def + + src_int32 = flow.cast(src_tensor, dtype=flow.int32) + like_def_int32 = flow.cast(like_def, dtype=flow.int32) + y_int32 = flow_scatter_op(like_def_int32, dim, indices_def, src_int32) + y_fp32 = flow.cast(y_int32, dtype=flow.float32) + + with flow.scope.placement(device_type, "0:0"): + flow.optimizer.SGD( + flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0 + ).minimize(y_fp32) + + flow.watch_diff(src_int32, _compare_diff) + return y_fp32 return scatter_add_like_fn - if value_type == flow.int32: - - @flow.global_function(type="train", function_config=func_config) - def scatter_add_like_fn( - params_def: oft.Numpy.Placeholder(input.shape, dtype=flow.float32), - indices_def: oft.Numpy.Placeholder(index.shape, dtype=index_type), - like_def: oft.Numpy.Placeholder(like.shape, dtype=flow.float32), - ) -> oft.Numpy: - with flow.scope.placement(device_type, "0:0"): - x_var = flow.get_variable( - "input", - shape=params_def.shape, - dtype=flow.float32, - initializer=flow.constant_initializer(0), - ) - x_var = flow.cast_to_current_logical_view(x_var) - x = x_var + params_def - - x_int32 = flow.cast(x, dtype=flow.int32) - like_def_int32 = flow.cast(like_def, dtype=flow.int32) - y_int32 = flow_scatter_op(x_int32, dim, indices_def, like_def_int32) - y_fp32 = flow.cast(y_int32, dtype=flow.int32) - - with flow.scope.placement(device_type, "0:0"): - flow.optimizer.SGD( - flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0 - ).minimize(y_fp32) - - flow.watch_diff(x, _compare_diff) - return y_fp32 - + else: + if 
value_type == flow.float32 or value_type == flow.float64: + @flow.global_function(type="predict", function_config=func_config) + def scatter_add_like_fn( + like_def: tp.Numpy.Placeholder(like.shape, dtype=value_type), + indices_def: tp.Numpy.Placeholder(index.shape, dtype=index_type), + src_def: tp.Numpy.Placeholder(src.shape, dtype=value_type), + ) -> tp.Numpy: + with flow.scope.placement(device_type, "0:0"): + src_var = flow.get_variable( + "src", + shape=src.shape, + dtype=value_type, + initializer=flow.constant_initializer(0), + ) + src_var = flow.cast_to_current_logical_view(src_var) + src_tensor = src_var + src_def + + y = flow_scatter_op(like_def, dim, indices_def, src_tensor) + return y + + if value_type == flow.int32: + @flow.global_function(type="predict", function_config=func_config) + def scatter_add_like_fn( + like_def: tp.Numpy.Placeholder(like.shape, dtype=flow.float32), + indices_def: tp.Numpy.Placeholder(index.shape, dtype=index_type), + src_def: tp.Numpy.Placeholder(src.shape, dtype=flow.float32), + ) -> tp.Numpy: + with flow.scope.placement(device_type, "0:0"): + src_var = flow.get_variable( + "src", + shape=src_def.shape, + dtype=flow.float32, + initializer=flow.constant_initializer(0), + ) + src_var = flow.cast_to_current_logical_view(src_var) + src_tensor = src_var + src_def + + src_int32 = flow.cast(src_tensor, dtype=flow.int32) + like_def_int32 = flow.cast(like_def, dtype=flow.int32) + y_int32 = flow_scatter_op(like_def_int32, dim, indices_def, src_int32) + y_fp32 = flow.cast(y_int32, dtype=flow.int32) + + return y_fp32 + return scatter_add_like_fn def _compare_dim_scatter_op_like_with_samples( test_case, + grad_flag, device_type, sample, value_type, @@ -242,10 +309,11 @@ def _compare_dim_scatter_op_like_with_samples( ): scatter_add_like_fn = _make_dim_scatter_add_like_fn( test_case, - sample["input"].astype(value_type[0]), + grad_flag, + sample["like"].astype(value_type[0]), sample["index"].astype(index_type[0]), sample["dim"], - sample["like"].astype(value_type[0]), + sample["src"].astype(value_type[0]), sample["grad"].astype(value_type[0]), device_type, value_type[1], @@ -255,13 +323,12 @@ def _compare_dim_scatter_op_like_with_samples( flow_scatter_op, ) y = scatter_add_like_fn( - sample["input"].astype(value_type[0]), - sample["index"].astype(index_type[0]), sample["like"].astype(value_type[0]), + sample["index"].astype(index_type[0]), + sample["src"].astype(value_type[0]) ) - y.astype(value_type[0]) - - if value_type == flow.float16: + y = y.astype(value_type[0]) + if value_type[1] == flow.float16: test_case.assertTrue( np.allclose(y, sample["output"].astype(np.float32), 1e-3, 1e-3) ) @@ -270,17 +337,17 @@ def _compare_dim_scatter_op_like_with_samples( @flow.unittest.skip_unless_1n1d() -class TestDimScatterOpsLike1n1d(flow.unittest.TestCase): +class TestDimScatterOps1n1d(flow.unittest.TestCase): def test_dim_scatter_add_like_int_cpu(test_case): arg_dict = _gen_arg_dict( - "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + False, "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) def test_dim_scatter_add_like_float_cpu(test_case): arg_dict = _gen_arg_dict( - "cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + False, "cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @@ -288,7 
+355,7 @@ def test_dim_scatter_add_like_float_cpu(test_case): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_int_gpu(test_case): arg_dict = _gen_arg_dict( - "gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + False, "gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @@ -296,13 +363,14 @@ def test_dim_scatter_add_like_int_gpu(test_case): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_float_gpu(test_case): arg_dict = _gen_arg_dict( - "gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + False, "gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) def test_dim_scatter_update_like_int_cpu(test_case): arg_dict = _gen_arg_dict( + False, "cpu", "int", "0:0", @@ -316,6 +384,7 @@ def test_dim_scatter_update_like_int_cpu(test_case): def test_dim_scatter_update_like_float_cpu(test_case): arg_dict = _gen_arg_dict( + False, "cpu", "float", "0:0", @@ -330,6 +399,7 @@ def test_dim_scatter_update_like_float_cpu(test_case): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_update_like_int_gpu(test_case): arg_dict = _gen_arg_dict( + False, "gpu", "int", "0:0", @@ -344,6 +414,7 @@ def test_dim_scatter_update_like_int_gpu(test_case): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_update_like_float_gpu(test_case): arg_dict = _gen_arg_dict( + False, "gpu", "float", "0:0", @@ -361,6 +432,7 @@ class TestDimScatterOpsLike1n2d(flow.unittest.TestCase): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_float(test_case): arg_dict = _gen_arg_dict( + False, "gpu", "float", "0:0-1", @@ -375,6 +447,7 @@ def test_dim_scatter_add_like_float(test_case): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_update_like_float(test_case): arg_dict = _gen_arg_dict( + False, "gpu", "float", "0:0-1", @@ -391,44 +464,53 @@ def test_dim_scatter_update_like_float(test_case): class TestDimScatterOpsInplace1n1d(flow.unittest.TestCase): def test_dim_scatter_add_int_cpu(test_case): arg_dict = _gen_arg_dict( - "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True + True, "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) def test_dim_scatter_add_float_cpu(test_case): arg_dict = _gen_arg_dict( - "cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True + True, "cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_like_int_gpu(test_case): + def test_dim_scatter_add_int_gpu(test_case): arg_dict = _gen_arg_dict( - "gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True + True, "gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only 
test cpu cases") - def test_dim_scatter_add_like_float_gpu(test_case): + def test_dim_scatter_add_float_gpu(test_case): arg_dict = _gen_arg_dict( - "gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True + True, "gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_add_float_gpu(test_case): + arg_dict = _gen_arg_dict( + True, "gpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + def test_dim_scatter_update_like_int_cpu(test_case): arg_dict = _gen_arg_dict( - "cpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True + True, "cpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) def test_dim_scatter_update_like_float_cpu(test_case): arg_dict = _gen_arg_dict( - "cpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True + True, "cpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @@ -436,7 +518,7 @@ def test_dim_scatter_update_like_float_cpu(test_case): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_update_like_int_gpu(test_case): arg_dict = _gen_arg_dict( - "gpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True + True, "gpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @@ -444,7 +526,7 @@ def test_dim_scatter_update_like_int_gpu(test_case): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_update_like_float_gpu(test_case): arg_dict = _gen_arg_dict( - "gpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True + True, "gpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @@ -455,7 +537,7 @@ class TestDimScatterOpsInplace1n2d(flow.unittest.TestCase): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_add_like_float(test_case): arg_dict = _gen_arg_dict( - "gpu", "float", "0:0-1", 2, _bin_add, flow.dim_scatter_add, inplace=True + True, "gpu", "float", "0:0-1", 2, _bin_add, flow.dim_scatter_add, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @@ -463,6 +545,7 @@ def test_dim_scatter_add_like_float(test_case): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_dim_scatter_update_like_float(test_case): arg_dict = _gen_arg_dict( + True, "gpu", "float", "0:0-1", From b95fff68d267a489ddf1261b94861c216724380a Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 12 Jul 2021 10:46:01 +0800 Subject: [PATCH 45/82] add float16n register --- oneflow/user/kernels/dim_scatter_scalar.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/oneflow/user/kernels/dim_scatter_scalar.cu b/oneflow/user/kernels/dim_scatter_scalar.cu index 6d02fe4d3f7..f2eb7d27235 100644 --- 
a/oneflow/user/kernels/dim_scatter_scalar.cu +++ b/oneflow/user/kernels/dim_scatter_scalar.cu @@ -64,9 +64,6 @@ class GpuDimScatterScalarUpdateKernel final : public OpKernel { idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, src_scalar, output); - // RUN_CUDA_KERNEL((ScatterScalarUpdateFunctor), ctx, BlocksNum4ThreadsNum(elem_cnt), - // idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, - // src_scalar, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -80,6 +77,8 @@ class GpuDimScatterScalarUpdateKernel final : public OpKernel { REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int32_t); REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int64_t); +REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float16, int32_t); +REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float16, int64_t); REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int32_t); REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int64_t); From 7427322cc2c089ee50ac1b8570f7d3d22252b0d4 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 12 Jul 2021 10:46:38 +0800 Subject: [PATCH 46/82] fix sbp --- oneflow/user/ops/dim_gather_op.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index 6128f5b8cfb..0b8a275b679 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp @@ -41,10 +41,6 @@ REGISTER_USER_OP("dim_gather") CHECK_EQ_OR_RETURN(in->is_dynamic(), index->is_dynamic()); - FOR_RANGE(int64_t, i, 0, input_num_axes) { - if (i == dim) { continue; } - CHECK_EQ_OR_RETURN(in->shape().At(i), index->shape().At(i)); - } user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); *out->mut_shape() = index->shape(); From 88f7a877366abda1449f856c3016d3b8ece68bfc Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 12 Jul 2021 10:47:03 +0800 Subject: [PATCH 47/82] fix sbp --- oneflow/user/ops/dim_scatter_ops.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index f1392ce8fae..7ecd110b6f8 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -68,7 +68,6 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = input ? 
input->shape() : like->shape(); - // printf("infertensor ok"); return Maybe::Ok(); } @@ -113,7 +112,7 @@ Maybe InputScalarArgModifierFn(user_op::GetInputArgModifier GetInputArgMod return Maybe::Ok(); } -void _SetSbp(user_op::SbpContext* ctx, const char* like_or_src) { +void _SetSbp(user_op::SbpContext* ctx, const char* like_or_input) { const user_op::TensorDesc& index_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); int64_t index_num_axes = index_tensor.shape().NumAxes(); @@ -123,32 +122,32 @@ void _SetSbp(user_op::SbpContext* ctx, const char* like_or_src) { if (i != dim) { ctx->NewBuilder() .Split(user_op::OpArg("index", 0), i) - .Split(user_op::OpArg("input", 0), i) + .Split(user_op::OpArg("src", 0), i) .Split(user_op::OpArg("output", 0), i) - .Split(user_op::OpArg(like_or_src, 0), i) + .Split(user_op::OpArg(like_or_input, 0), i) .Build(); } else { ctx->NewBuilder() .Split(user_op::OpArg("index", 0), i) - .Split(user_op::OpArg("input", 0), i) + .Split(user_op::OpArg("src", 0), i) .PartialSum(user_op::OpArg("output", 0)) - .Broadcast(user_op::OpArg(like_or_src, 0)) + .Broadcast(user_op::OpArg(like_or_input, 0)) .Build(); ctx->NewBuilder() .Split(user_op::OpArg("index", 0), i) - .Split(user_op::OpArg("input", 0), i) + .Split(user_op::OpArg("src", 0), i) .PartialSum(user_op::OpArg("output", 0)) - .PartialSum(user_op::OpArg(like_or_src, 0)) + .PartialSum(user_op::OpArg(like_or_input, 0)) .Build(); } } ctx->NewBuilder() - .PartialSum(user_op::OpArg("input", 0)) + .PartialSum(user_op::OpArg("src", 0)) .Broadcast(user_op::OpArg("index", 0)) .PartialSum(user_op::OpArg("output", 0)) - .PartialSum(user_op::OpArg(like_or_src, 0)) + .PartialSum(user_op::OpArg(like_or_input, 0)) .Build(); } @@ -158,7 +157,7 @@ Maybe SetSbpLike(user_op::SbpContext* ctx) { } Maybe SetSbpScatter(user_op::SbpContext* ctx) { - _SetSbp(ctx, "src"); + _SetSbp(ctx, "input"); return Maybe::Ok(); } From 679c4cd0e93373bfd35a5de61f79313274b59544 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 12 Jul 2021 12:01:39 +0800 Subject: [PATCH 48/82] add api doc --- oneflow/python/ops/array_ops.py | 221 +++++++++++++++++++++++--------- 1 file changed, 159 insertions(+), 62 deletions(-) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 1209305fece..53cbc17e9a9 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2648,44 +2648,30 @@ def nvtx_end( ) return op.InferAndTryRun().SoleOutputBlob() -def _check_scatter_blobs(input, dim, index, like_or_src): - assert dim < len(index.shape), ValueError( - "Value of dim is out of range(dim should be less than len(index.shape))" +def _check_scatter_blobs(input, dim, index, src): + _input_num_axes = len(input.shape) + _index_num_axes = len(index.shape) + _src_num_axes = len(src.shape) + # check index.numaxes == src.num_axes == input.numaxes + assert _input_num_axes == _index_num_axes, ValueError( + "The num axes of input should be equal to index's num axes" ) - assert len(input.shape) == len(index.shape) and len(input.shape) == len( - like_or_src.shape - ), ValueError("Number of dimensions of input, index and like/src should equal") - for i in range(0, len(input.shape)): - assert input.shape[i] == index.shape[i], ValueError( - "Shape of input and index should be same" - ) - assert input.shape[i] <= like_or_src.shape[i], ValueError( - "Shape like/src blob should be larger than input" + assert _index_num_axes == _src_num_axes, ValueError( + "The num axes of input, index, src should be equal" + ) + 
assert dim < _index_num_axes, ValueError( + "Value of dim is out of range(dim should be less than the num axes of index)" + ) + + for i in range(0, _input_num_axes): + assert index.shape[i] <= input.shape[i], ValueError( + "Shape of input should be larger than index" ) -@oneflow_export("dim_scatter_update_like") -@stable_api -def dim_scatter_update_like( - input: oneflow._oneflow_internal.BlobDesc, - dim: int, - index: oneflow._oneflow_internal.BlobDesc, - like: oneflow._oneflow_internal.BlobDesc, - name: Optional[str] = None, -) -> oneflow._oneflow_internal.BlobDesc: - _check_scatter_blobs(input, dim, index, like) - return ( - flow.user_op_builder( - name if name is not None else id_util.UniqueStr("DimScatterUpdateLike_") + assert index.shape[i] <= src.shape[i], ValueError( + "Shape of src should be larger than index" ) - .Op("dim_scatter_update_like") - .Input("input", [input]) - .Input("index", [index]) - .Input("like", [like]) - .Output("output") - .Attr("dim", int(dim)) - .Build() - .InferAndTryRun() - .RemoteBlobList()[0] - ) + + @oneflow_export("dim_scatter_update") @stable_api def dim_scatter_update( @@ -2695,43 +2681,97 @@ def dim_scatter_update( src: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None, ) -> oneflow._oneflow_internal.BlobDesc: - return ( - flow.user_op_builder( - name if name is not None else id_util.UniqueStr("DimScatterUpdate_") + r"""This operator writes the elements specified by `index` along with the axis + `dim` from the `src` into the `input`. + + Take a 3-D blob as example, the output is specified by: + + .. code-block:: python + + output[index[i][j][k]][j][k] = input[i][j][k] # if dim == 0 + output[i][index[i][j][k]][k] = input[i][j][k] # if dim == 1 + output[i][j][index[i][j][k]] = input[i][j][k] # if dim == 2 + + input, index and src (if it is a Tensor) should all have the same number of dimensions. + It is also required that index.shape(d) <= src.shape(d) for all dimensions d, + and that index.shape(d) <= self.shape(d) for all dimensions d != dim. + Note that index and src do not broadcast. + + Args: + input (oneflow._oneflow_internal.BlobDesc): The input blob. + dim (int): The axis along which to index + index (oneflow._oneflow_internal.BlobDesc): The index blob of elements to scatter. + src (oneflow._oneflow_internal.BlobDesc): The source blob whose elements will be scatterd and updated to output. + name (Optional[str], optional):The name of the operation. Defaults to None. + + Returns: + oneflow._oneflow_internal.BlobDesc: The scatterd Blob. + + For example: + + .. code-block:: python + + import oneflow as flow + import oneflow.typing as tp + import numpy as np + + + @flow.global_function() + def dim_scatter_update_job( + input: tp.Numpy.Placeholder((3, 5), dtype=flow.float32), + index: tp.Numpy.Placeholder((2, 3), dtype=flow.int32), + src: tp.Numpy.Placeholder((2, 5), dtype=flow.float32), + ) -> tp.Numpy: + return flow.dim_scatter_update(input, 1, index, src) + + + input = np.ones(shape=(3, 5), dtype=np.float32) + index = np.array([[0, 1, 2], + [0, 1, 4]], dtype=np.int32) + src = np.array([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10]], dtype=np.float32) + out = dim_scatter_update_job(input, index, src) + print(out) + + # out [[1. 2. 3. 1. 1.] + # [6. 7. 1. 1. 8.] + # [1. 1. 1. 1. 
1.]] + + """ + + if type(src) is oneflow._oneflow_internal.LazyConsistentBlob: + _check_scatter_blobs(input, dim, index, src) + return ( + flow.user_op_builder( + name if name is not None else id_util.UniqueStr("DimScatterUpdate_") + ) + .Op("dim_scatter_update") + .Input("input", [input]) + .Input("index", [index]) + .Input("src", [src]) + .Output("output") + .Attr("dim", int(dim)) + .Build() + .InferAndTryRun() + .RemoteBlobList()[0] ) - .Op("dim_scatter_update") - .Input("input", [input]) - .Input("index", [index]) - .Input("src", [src]) - .Output("output") - .Attr("dim", int(dim)) - .Build() - .InferAndTryRun() - .RemoteBlobList()[0] - ) -@oneflow_export("dim_scatter_add_like") -@stable_api -def dim_scatter_add_like( - input: oneflow._oneflow_internal.BlobDesc, - dim: int, - index: oneflow._oneflow_internal.BlobDesc, - like: oneflow._oneflow_internal.BlobDesc, - name: Optional[str] = None, -) -> oneflow._oneflow_internal.BlobDesc: - return ( + else: + return ( flow.user_op_builder( - name if name is not None else id_util.UniqueStr("DimScatterAddLike_") + name if name is not None else id_util.UniqueStr("DimScatterScalarUpdate_") ) - .Op("dim_scatter_add_like") + .Op("dim_scatter_scalar_update") .Input("input", [input]) .Input("index", [index]) - .Input("like", [like]) + .Attr("src_scalar", float(src)) .Output("output") .Attr("dim", int(dim)) .Build() .InferAndTryRun() .RemoteBlobList()[0] ) + + @oneflow_export("dim_scatter_add") @stable_api def dim_scatter_add( @@ -2741,6 +2781,63 @@ def dim_scatter_add( src: oneflow._oneflow_internal.BlobDesc, name: Optional[str] = None, ) -> oneflow._oneflow_internal.BlobDesc: + r"""This operator adds the elements specified by `index` along with the axis + `dim` from the `src` into the `input`. + + Take a 3-D blob as example, the output is specified by: + + .. code-block:: python + + output[index[i][j][k]][j][k] += input[i][j][k] # if dim == 0 + output[i][index[i][j][k]][k] += input[i][j][k] # if dim == 1 + output[i][j][index[i][j][k]] += input[i][j][k] # if dim == 2 + + input, index and src (if it is a Tensor) should all have the same number of dimensions. + It is also required that index.shape(d) <= src.shape(d) for all dimensions d, + and that index.shape(d) <= self.shape(d) for all dimensions d != dim. + Note that index and src do not broadcast. + + Args: + input (oneflow._oneflow_internal.BlobDesc): The input blob. + dim (int): The axis along which to index + index (oneflow._oneflow_internal.BlobDesc): The index blob of elements to scatter. + src (oneflow._oneflow_internal.BlobDesc): The source blob whose elements will be scatterd and added to output. + name (Optional[str], optional):The name of the operation. Defaults to None. + + Returns: + oneflow._oneflow_internal.BlobDesc: The scatterd Blob. + + For example: + + .. code-block:: python + + import oneflow as flow + import oneflow.typing as tp + import numpy as np + + + @flow.global_function() + def dim_scatter_add_job( + input: tp.Numpy.Placeholder((3, 5), dtype=flow.float32), + index: tp.Numpy.Placeholder((2, 3), dtype=flow.int32), + src: tp.Numpy.Placeholder((2, 5), dtype=flow.float32), + ) -> tp.Numpy: + return flow.dim_scatter_add(input, 1, index, src) + + + input = np.ones(shape=(3, 5), dtype=np.float32) + index = np.array([[0, 1, 2], + [0, 1, 4]], dtype=np.int32) + src = np.array([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10]], dtype=np.float32) + out = dim_scatter_add_job(input, index, src) + print(out) + + # out [[2. 3. 4. 1. 1.] + # [7. 8. 1. 1. 9.] + # [1. 1. 1. 1. 
1.]] + + """ return ( flow.user_op_builder( name if name is not None else id_util.UniqueStr("DimScatterAdd_") @@ -2754,4 +2851,4 @@ def dim_scatter_add( .Build() .InferAndTryRun() .RemoteBlobList()[0] - ) \ No newline at end of file + ) From 1032c5f8597de16c1f35158fa51710b4054b24f9 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 12 Jul 2021 12:04:04 +0800 Subject: [PATCH 49/82] make format --- oneflow/python/ops/array_ops.py | 33 +-- .../python/test/ops/test_dim_scatter_ops.py | 213 ++++++------------ .../user/kernels/dim_gather_scatter_util.h | 77 ++++--- .../user/kernels/dim_scatter_kernel_util.h | 26 ++- oneflow/user/kernels/dim_scatter_kernels.cpp | 81 +++---- oneflow/user/kernels/dim_scatter_scalar.cpp | 59 +++-- oneflow/user/kernels/dim_scatter_scalar.cu | 76 ++++--- oneflow/user/kernels/dim_scatter_scalar.h | 49 ++-- oneflow/user/ops/dim_gather_op.cpp | 133 ++++++----- oneflow/user/ops/dim_scatter_ops.cpp | 198 ++++++++-------- 10 files changed, 462 insertions(+), 483 deletions(-) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 53cbc17e9a9..e870d94afce 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2648,6 +2648,7 @@ def nvtx_end( ) return op.InferAndTryRun().SoleOutputBlob() + def _check_scatter_blobs(input, dim, index, src): _input_num_axes = len(input.shape) _index_num_axes = len(index.shape) @@ -2662,7 +2663,7 @@ def _check_scatter_blobs(input, dim, index, src): assert dim < _index_num_axes, ValueError( "Value of dim is out of range(dim should be less than the num axes of index)" ) - + for i in range(0, _input_num_axes): assert index.shape[i] <= input.shape[i], ValueError( "Shape of input should be larger than index" @@ -2739,7 +2740,7 @@ def dim_scatter_update_job( """ - if type(src) is oneflow._oneflow_internal.LazyConsistentBlob: + if type(src) is oneflow._oneflow_internal.LazyConsistentBlob: _check_scatter_blobs(input, dim, index, src) return ( flow.user_op_builder( @@ -2755,21 +2756,23 @@ def dim_scatter_update_job( .InferAndTryRun() .RemoteBlobList()[0] ) - else: + else: return ( - flow.user_op_builder( - name if name is not None else id_util.UniqueStr("DimScatterScalarUpdate_") + flow.user_op_builder( + name + if name is not None + else id_util.UniqueStr("DimScatterScalarUpdate_") + ) + .Op("dim_scatter_scalar_update") + .Input("input", [input]) + .Input("index", [index]) + .Attr("src_scalar", float(src)) + .Output("output") + .Attr("dim", int(dim)) + .Build() + .InferAndTryRun() + .RemoteBlobList()[0] ) - .Op("dim_scatter_scalar_update") - .Input("input", [input]) - .Input("index", [index]) - .Attr("src_scalar", float(src)) - .Output("output") - .Attr("dim", int(dim)) - .Build() - .InferAndTryRun() - .RemoteBlobList()[0] - ) @oneflow_export("dim_scatter_add") diff --git a/oneflow/python/test/ops/test_dim_scatter_ops.py b/oneflow/python/test/ops/test_dim_scatter_ops.py index 4f9218f713d..73182652d6c 100644 --- a/oneflow/python/test/ops/test_dim_scatter_ops.py +++ b/oneflow/python/test/ops/test_dim_scatter_ops.py @@ -46,14 +46,14 @@ def gen_scatter_like_test_sample( def _np_dim_scatter_add_like(like, dim, index, src): out_shape = like.shape flatten_idx = index.flatten() - + if inplace: output = like.copy() else: output = np.zeros(out_shape) for idx in range(0, index.size): - idx_coord = list(np.unravel_index(idx, index.shape)) + idx_coord = list(np.unravel_index(idx, index.shape)) idx_elem = flatten_idx[idx] src_offset = np.ravel_multi_index(idx_coord, src.shape) idx_coord[dim] 
= idx_elem @@ -87,7 +87,7 @@ def _np_dim_gather(dim, input, index): shape_elemcnt *= index_shape_list[i] index_total = [] - for i in range(int(shape_elemcnt/like_shape[dim])): + for i in range(int(shape_elemcnt / like_shape[dim])): index_arr = np.arange(0, like_shape[dim]) random.shuffle(index_arr) index_total.append(index_arr) @@ -108,13 +108,13 @@ def _np_dim_gather(dim, input, index): def _gen_arg_dict( - grad_flag=False, + grad_flag=False, device_type="gpu", value_type="float", machine_ids="0:0", device_count=1, binop=_bin_add, - dim_scatter_op=flow.dim_scatter_add_like, + dim_scatter_op=flow.dim_scatter_add, inplace=True, ): arg_dict = OrderedDict() @@ -133,11 +133,11 @@ def _gen_arg_dict( ) ) if value_type == "float": - if device_type == "cpu": + if device_type == "cpu": arg_dict["value_type"] = [ (np.float32, flow.float32), ] - else: + else: arg_dict["value_type"] = [ (np.float32, flow.float32), ] @@ -155,7 +155,7 @@ def _gen_arg_dict( def _make_dim_scatter_add_like_fn( test_case, - grad_flag, + grad_flag, src, index, dim, @@ -188,8 +188,9 @@ def _make_dim_scatter_add_like_fn( def _compare_diff(blob: tp.Numpy): test_case.assertTrue(np.allclose(grad, blob)) - if grad_flag: + if grad_flag: if value_type == flow.float32 or value_type == flow.float64: + @flow.global_function(type="train", function_config=func_config) def scatter_add_like_fn( like_def: tp.Numpy.Placeholder(like.shape, dtype=value_type), @@ -210,13 +211,15 @@ def scatter_add_like_fn( with flow.scope.placement(device_type, "0:0"): flow.optimizer.SGD( - flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0 + flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), + momentum=0, ).minimize(y) flow.watch_diff(src_var, _compare_diff) return y if value_type == flow.int32: + @flow.global_function(type="train", function_config=func_config) def scatter_add_like_fn( like_def: tp.Numpy.Placeholder(like.shape, dtype=flow.float32), @@ -240,7 +243,8 @@ def scatter_add_like_fn( with flow.scope.placement(device_type, "0:0"): flow.optimizer.SGD( - flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0 + flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), + momentum=0, ).minimize(y_fp32) flow.watch_diff(src_int32, _compare_diff) @@ -250,6 +254,7 @@ def scatter_add_like_fn( else: if value_type == flow.float32 or value_type == flow.float64: + @flow.global_function(type="predict", function_config=func_config) def scatter_add_like_fn( like_def: tp.Numpy.Placeholder(like.shape, dtype=value_type), @@ -270,6 +275,7 @@ def scatter_add_like_fn( return y if value_type == flow.int32: + @flow.global_function(type="predict", function_config=func_config) def scatter_add_like_fn( like_def: tp.Numpy.Placeholder(like.shape, dtype=flow.float32), @@ -292,13 +298,13 @@ def scatter_add_like_fn( y_fp32 = flow.cast(y_int32, dtype=flow.int32) return y_fp32 - + return scatter_add_like_fn def _compare_dim_scatter_op_like_with_samples( test_case, - grad_flag, + grad_flag, device_type, sample, value_type, @@ -309,7 +315,7 @@ def _compare_dim_scatter_op_like_with_samples( ): scatter_add_like_fn = _make_dim_scatter_add_like_fn( test_case, - grad_flag, + grad_flag, sample["like"].astype(value_type[0]), sample["index"].astype(index_type[0]), sample["dim"], @@ -325,7 +331,7 @@ def _compare_dim_scatter_op_like_with_samples( y = scatter_add_like_fn( sample["like"].astype(value_type[0]), sample["index"].astype(index_type[0]), - sample["src"].astype(value_type[0]) + sample["src"].astype(value_type[0]), ) y = y.astype(value_type[0]) if value_type[1] == 
flow.float16: @@ -337,215 +343,132 @@ def _compare_dim_scatter_op_like_with_samples( @flow.unittest.skip_unless_1n1d() -class TestDimScatterOps1n1d(flow.unittest.TestCase): - def test_dim_scatter_add_like_int_cpu(test_case): +class TestDimScatterOpsInplace1n1d(flow.unittest.TestCase): + def test_dim_scatter_add_int_cpu(test_case): arg_dict = _gen_arg_dict( - False, "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + True, "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) - def test_dim_scatter_add_like_float_cpu(test_case): + def test_dim_scatter_add_float_cpu(test_case): arg_dict = _gen_arg_dict( - False, "cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + True, "cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_like_int_gpu(test_case): + def test_dim_scatter_add_int_gpu(test_case): arg_dict = _gen_arg_dict( - False, "gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + True, "gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_like_float_gpu(test_case): + def test_dim_scatter_add_float_gpu(test_case): arg_dict = _gen_arg_dict( - False, "gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add_like, inplace=False + True, "gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True + ) + for arg in GenArgList(arg_dict): + _compare_dim_scatter_op_like_with_samples(test_case, *arg) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_dim_scatter_add_float_gpu(test_case): + arg_dict = _gen_arg_dict( + True, + "gpu", + "float", + "0:0", + 1, + _bin_update, + flow.dim_scatter_update, + inplace=True, ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) - def test_dim_scatter_update_like_int_cpu(test_case): + def test_dim_scatter_update_int_cpu(test_case): arg_dict = _gen_arg_dict( - False, + True, "cpu", "int", "0:0", 1, _bin_update, - flow.dim_scatter_update_like, - inplace=False, + flow.dim_scatter_update, + inplace=True, ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) - def test_dim_scatter_update_like_float_cpu(test_case): + def test_dim_scatter_update_float_cpu(test_case): arg_dict = _gen_arg_dict( - False, + True, "cpu", "float", "0:0", 1, _bin_update, - flow.dim_scatter_update_like, - inplace=False, + flow.dim_scatter_update, + inplace=True, ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_update_like_int_gpu(test_case): + def test_dim_scatter_update_int_gpu(test_case): arg_dict = _gen_arg_dict( - False, + True, "gpu", "int", "0:0", 1, _bin_update, - flow.dim_scatter_update_like, - inplace=False, + flow.dim_scatter_update, + inplace=True, ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - 
def test_dim_scatter_update_like_float_gpu(test_case): + def test_dim_scatter_update_float_gpu(test_case): arg_dict = _gen_arg_dict( - False, + True, "gpu", "float", "0:0", 1, _bin_update, - flow.dim_scatter_update_like, - inplace=False, + flow.dim_scatter_update, + inplace=True, ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @flow.unittest.skip_unless_1n2d() -class TestDimScatterOpsLike1n2d(flow.unittest.TestCase): +class TestDimScatterOpsInplace1n2d(flow.unittest.TestCase): @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_like_float(test_case): + def test_dim_scatter_add_float(test_case): arg_dict = _gen_arg_dict( - False, + True, "gpu", "float", "0:0-1", 2, _bin_add, - flow.dim_scatter_add_like, - inplace=False, - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_update_like_float(test_case): - arg_dict = _gen_arg_dict( - False, - "gpu", - "float", - "0:0-1", - 2, - _bin_update, - flow.dim_scatter_update_like, - inplace=False, - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - -@flow.unittest.skip_unless_1n1d() -class TestDimScatterOpsInplace1n1d(flow.unittest.TestCase): - def test_dim_scatter_add_int_cpu(test_case): - arg_dict = _gen_arg_dict( - True, "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - def test_dim_scatter_add_float_cpu(test_case): - arg_dict = _gen_arg_dict( - True, "cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_int_gpu(test_case): - arg_dict = _gen_arg_dict( - True, "gpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_float_gpu(test_case): - arg_dict = _gen_arg_dict( - True, "gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_float_gpu(test_case): - arg_dict = _gen_arg_dict( - True, "gpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - - def test_dim_scatter_update_like_int_cpu(test_case): - arg_dict = _gen_arg_dict( - True, "cpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - def test_dim_scatter_update_like_float_cpu(test_case): - arg_dict = _gen_arg_dict( - True, "cpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def 
test_dim_scatter_update_like_int_gpu(test_case): - arg_dict = _gen_arg_dict( - True, "gpu", "int", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_update_like_float_gpu(test_case): - arg_dict = _gen_arg_dict( - True, "gpu", "float", "0:0", 1, _bin_update, flow.dim_scatter_update, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - -@flow.unittest.skip_unless_1n2d() -class TestDimScatterOpsInplace1n2d(flow.unittest.TestCase): - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_like_float(test_case): - arg_dict = _gen_arg_dict( - True, "gpu", "float", "0:0-1", 2, _bin_add, flow.dim_scatter_add, inplace=True + flow.dim_scatter_add, + inplace=True, ) for arg in GenArgList(arg_dict): _compare_dim_scatter_op_like_with_samples(test_case, *arg) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_update_like_float(test_case): + def test_dim_scatter_update_float(test_case): arg_dict = _gen_arg_dict( - True, + True, "gpu", "float", "0:0-1", diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index ddb3fd27f7d..e1530dad5d0 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -42,63 +42,74 @@ template struct DeviceBinOp { OF_DEVICE_FUNC static void Add(const T* x, T* y) { #ifdef __CUDA_ARCH__ - cuda::atomic::Add(y, *x); + cuda::atomic::Add(y, *x); #else *y += *x; #endif } - OF_DEVICE_FUNC static void Update(const T* x, T* y) { *y = *x; } }; // ----- macros for scatter functors ----- -#define DECLARE_DIMSCATTER_FUNCTOR(binop) \ - template \ - struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, \ - const int32_t dim, const int64_t upper_bound, const IDX_T* index, const IN_T* src, IN_T* output); \ +#define DECLARE_DIMSCATTER_FUNCTOR(binop) \ + template \ + struct DimScatter##binop##Functor final { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, \ + const DimOpIndexNdHelper& idx_nd_helper, \ + const DimOpIndexNdHelper& output_nd_helper, const int ndim, \ + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ + const IDX_T* index, const IN_T* src, IN_T* output); \ } -#define IMPLEMENT_DIMSCATTER_CPUFUNCTOR(binop) \ - template \ - struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, \ - const int32_t dim, const int64_t upper_bound, const IDX_T* index, const IN_T* src, IN_T* output) { \ - DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, \ - index, src, output, DeviceBinOp::binop); \ - } \ +#define IMPLEMENT_DIMSCATTER_CPUFUNCTOR(binop) \ + template \ + struct DimScatter##binop##Functor final { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, \ + const DimOpIndexNdHelper& idx_nd_helper, \ + const 
DimOpIndexNdHelper& output_nd_helper, const int ndim, \ + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ + const IDX_T* index, const IN_T* src, IN_T* output) { \ + DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, \ + elem_cnt, dim, upper_bound, index, src, output, \ + DeviceBinOp::binop); \ + } \ } #define IMPLEMENT_DIMSCATTER_GPUFUNCTOR(binop) \ template \ - __global__ void DoCUDADimScatter##binop(const DimOpIndexNdHelper src_nd_helper, \ - const DimOpIndexNdHelper idx_nd_helper, \ + __global__ void DoCUDADimScatter##binop(const DimOpIndexNdHelper src_nd_helper, \ + const DimOpIndexNdHelper idx_nd_helper, \ const DimOpIndexNdHelper output_nd_helper, \ - const int ndim, const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ - const IDX_T* index, const IN_T* src, IN_T* output) { \ - DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, \ - src, output, DeviceBinOp::binop); \ + const int ndim, const int64_t elem_cnt, \ + const int32_t dim, const int64_t upper_bound, \ + const IDX_T* index, const IN_T* src, IN_T* output) { \ + DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, \ + dim, upper_bound, index, src, output, \ + DeviceBinOp::binop); \ } \ template \ struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, \ - const int32_t dim, const int64_t upper_bound, const IDX_T* index, const IN_T* src, IN_T* output) { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, \ + const DimOpIndexNdHelper& idx_nd_helper, \ + const DimOpIndexNdHelper& output_nd_helper, const int ndim, \ + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ + const IDX_T* index, const IN_T* src, IN_T* output) { \ RUN_CUDA_KERNEL((DoCUDADimScatter##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ - src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, src, \ - output); \ + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, \ + upper_bound, index, src, output); \ } \ }; \ template \ struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, \ - const int32_t dim, const int64_t upper_bound, const IDX_T* index, const float16* src, float16* output) { \ + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, \ + const DimOpIndexNdHelper& idx_nd_helper, \ + const DimOpIndexNdHelper& output_nd_helper, const int ndim, \ + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ + const IDX_T* index, const float16* src, float16* output) { \ RUN_CUDA_KERNEL((DoCUDADimScatter##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ - src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, \ - reinterpret_cast(src), reinterpret_cast(output)); \ + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, \ + upper_bound, index, reinterpret_cast(src), \ + reinterpret_cast(output)); \ } \ } diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index 90510e636cf..7f9010af6ab 100644 --- 
a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -48,24 +48,26 @@ DECLARE_DIMSCATTER_FUNCTOR(Add); DECLARE_DIMSCATTER_FUNCTOR(Update); template -OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_helper, +OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, const IDX_T* index, - const IN_T* src, IN_T* output, BinaryOpFn bin_op) { + const DimOpIndexNdHelper& output_nd_helper, + const int ndim, const int64_t elem_cnt, const int32_t dim, + int64_t upper_bound, const IDX_T* index, const IN_T* src, + IN_T* output, BinaryOpFn bin_op) { XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; - idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk + idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk IDX_T idx_elem = index[idx_offset]; - if(idx_elem>=upper_bound){ - #if __CUDA_ARCH__ - __trap(); - #else - std::cout<<"The index element "<= upper_bound) { +#if __CUDA_ARCH__ + __trap(); +#else + std::cout << "The index element " << idx_elem << " is out of bounds for dimension " << dim + << " with size " << upper_bound << std::endl; throw Error::CheckFailedError(); - #endif +#endif } - IDX_T src_offset = src_nd_helper.NdIndexToOffset(coordinate, ndim); + IDX_T src_offset = src_nd_helper.NdIndexToOffset(coordinate, ndim); coordinate[dim] = idx_elem; IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); bin_op(src + src_offset, output + output_offset); diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 0bd4ba30446..a6b0d8229e5 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -28,29 +28,31 @@ namespace user_op { ~DimScatter##binop##Kernel() override = default; \ \ private: \ - void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, \ + void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, \ + const DimOpIndexNdHelper& idx_nd_helper, \ const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T* src, \ + int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T* src, \ IN_T* output) const override { \ DimScatter##binop##Functor()( \ - ctx, src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, src, output); \ + ctx, src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, \ + index, src, output); \ } \ bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ } -#define REGISTER_DIM_SCATTER_LIKE_KERNEL(device, dtype, itype, optypename, binop) \ - REGISTER_USER_KERNEL(optypename) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ +#define REGISTER_DIM_SCATTER_LIKE_KERNEL(device, dtype, itype, optypename, binop) \ + REGISTER_USER_KERNEL(optypename) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("like", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -#define REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(device, optypename, binop) \ - 
REGISTER_DIM_SCATTER_LIKE_KERNEL(device, float, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(device, double, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(device, int32_t, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(device, float, int64_t, optypename, binop) \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(device, double, int64_t, optypename, binop) \ +#define REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, float, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, double, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, int32_t, int32_t, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, float, int64_t, optypename, binop) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(device, double, int64_t, optypename, binop) \ REGISTER_DIM_SCATTER_LIKE_KERNEL(device, int32_t, int64_t, optypename, binop) #define REGISTER_DIM_SCATTER_LIKE_CPUKERNELS(optypename, binop) \ @@ -58,7 +60,7 @@ namespace user_op { #ifdef WITH_CUDA #define REGISTER_DIM_SCATTER_LIKE_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ REGISTER_DIM_SCATTER_LIKE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ REGISTER_DIM_SCATTER_LIKE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); #else @@ -69,20 +71,19 @@ namespace user_op { REGISTER_DIM_SCATTER_LIKE_CPUKERNELS(optypename, binop); \ REGISTER_DIM_SCATTER_LIKE_GPUKERNELS(optypename, binop); - -#define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype, optypename, binop) \ - REGISTER_USER_KERNEL(optypename) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)) \ - -#define REGISTER_DIM_SCATTER_BINOP_KERNELS_DEVICE(device, optypename, binop) \ - REGISTER_DIM_SCATTER_KERNEL(device, float, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(device, double, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(device, float, int64_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(device, double, int64_t, optypename, binop); \ +#define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype, optypename, binop) \ + REGISTER_USER_KERNEL(optypename) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)) + +#define REGISTER_DIM_SCATTER_BINOP_KERNELS_DEVICE(device, optypename, binop) \ + REGISTER_DIM_SCATTER_KERNEL(device, float, int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(device, double, int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int32_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(device, float, int64_t, optypename, binop); \ + REGISTER_DIM_SCATTER_KERNEL(device, double, int64_t, optypename, binop); \ REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int64_t, optypename, binop); #define REGISTER_DIM_SCATTER_CPUKERNELS(optypename, binop) \ @@ -90,14 +91,14 @@ namespace user_op { #ifdef WITH_CUDA #define REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop) \ - 
REGISTER_DIM_SCATTER_BINOP_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ + REGISTER_DIM_SCATTER_BINOP_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); #else #define REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop) #endif // WITH_CUDA -#define REGISTER_SCATTER_KERNEL(optypename, binop) \ +#define REGISTER_SCATTER_KERNEL(optypename, binop) \ REGISTER_DIM_SCATTER_CPUKERNELS(optypename, binop); \ REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop); @@ -106,10 +107,11 @@ class DimScatterBaseKernel : public user_op::OpKernel { public: DimScatterBaseKernel() = default; ~DimScatterBaseKernel() override = default; - virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T* src, - IN_T* output) const = 0; + virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, int64_t upper_bound, const IDX_T* index, + const IN_T* src, IN_T* output) const = 0; private: void Compute(KernelComputeContext* ctx) const override { @@ -132,7 +134,7 @@ class DimScatterBaseKernel : public user_op::OpKernel { } else if (like_tensor) { Memset(ctx->device_ctx(), output, 0, out_bytes_size); } else { - std::cout<<"Unimplemented Error"< output_nd_helper(shape_vec.data(), ndim); int64_t upper_bound = 0; - if(input_tensor){ - upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound - } - else{ - upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound + if (input_tensor) { + upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound + } else { + upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound } BinaryOp(ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, diff --git a/oneflow/user/kernels/dim_scatter_scalar.cpp b/oneflow/user/kernels/dim_scatter_scalar.cpp index c81fb082147..3bbe8b8d848 100644 --- a/oneflow/user/kernels/dim_scatter_scalar.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar.cpp @@ -1,16 +1,31 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
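For readers following the kernel changes above: DimScatterBaseKernel::Compute copies (or zero-fills) the destination, picks upper_bound from the input/like shape along dim, and then applies the per-element BinaryOp. The following is a minimal NumPy sketch of those semantics, not code from this patch; np_dim_scatter, scatter_add and scatter_update are illustrative names, and the example assumes 2-D tensors with PyTorch-style scatter indexing.

    import numpy as np

    def np_dim_scatter(output, dim, index, src, binop):
        # For every position of `index`, replace its `dim` coordinate with the
        # index value and combine src at the original position into output there.
        upper_bound = output.shape[dim]
        for pos in np.ndindex(*index.shape):
            idx = int(index[pos])
            assert idx < upper_bound, "index element out of bounds for dimension"
            target = list(pos)
            target[dim] = idx
            binop(output, tuple(target), src[pos])
        return output

    def scatter_add(out, coord, val):
        out[coord] += val

    def scatter_update(out, coord, val):
        out[coord] = val

    out = np.zeros((3, 3), dtype=np.float32)
    idx = np.array([[0, 2, 1]])
    src = np.ones((1, 3), dtype=np.float32)
    np_dim_scatter(out, 0, idx, src, scatter_add)  # adds 1 at (0,0), (2,1), (1,2)

The out-of-bounds assert mirrors the __trap()/CheckFailedError branch in DoDimScatterBinOp: indices must stay below the destination extent along dim.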
+*/ #include "oneflow/user/kernels/dim_scatter_scalar.h" -namespace oneflow{ +namespace oneflow { + +namespace user_op { -namespace user_op{ +template +class CpuDimScatterScalarUpdateKernel final : public user_op::OpKernel { + public: + CpuDimScatterScalarUpdateKernel() = default; + ~CpuDimScatterScalarUpdateKernel() = default; -template -class CpuDimScatterScalarUpdateKernel final : public user_op::OpKernel { - public: - CpuDimScatterScalarUpdateKernel() = default; - ~CpuDimScatterScalarUpdateKernel() = default; - - private: + private: void Compute(KernelComputeContext* ctx) const override { const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); @@ -30,7 +45,7 @@ class CpuDimScatterScalarUpdateKernel final : public user_op::OpKernel { } else if (like_tensor) { Memset(ctx->device_ctx(), output, 0, out_bytes_size); } else { - std::cout<<"Unimplemented Error"< shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), - [](int32_t dim) -> IDX_T { return static_cast(dim); }); + [](int32_t dim) -> IDX_T { return static_cast(dim); }); }; shape2dims(index_tensor->shape()); DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); @@ -48,24 +63,24 @@ class CpuDimScatterScalarUpdateKernel final : public user_op::OpKernel { int64_t upper_bound = input_tensor->shape().At(dim); ScatterScalarUpdateFunctor(idx_nd_helper, output_nd_helper, ndim, - index_tensor->shape().elem_cnt(), dim, upper_bound, index, src_scalar, output); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + index_tensor->shape().elem_cnt(), dim, upper_bound, + index, src_scalar, output); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_CPU_SCATTERSCALAR_KERNEL(device, dtype, itype) \ - REGISTER_USER_KERNEL("dim_scatter_scalar_update") \ - .SetCreateFn>() \ +#define REGISTER_CPU_SCATTERSCALAR_KERNEL(device, dtype, itype) \ + REGISTER_USER_KERNEL("dim_scatter_scalar_update") \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); + & (user_op::HobDataType("index", 0) == GetDataType::value)); REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int32_t); REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int64_t); REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, double, int32_t); REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, double, int64_t); - -} // namespace user_op -} // namespace oneflow +} // namespace user_op +} // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_scalar.cu b/oneflow/user/kernels/dim_scatter_scalar.cu index f2eb7d27235..82edc6c8939 100644 --- a/oneflow/user/kernels/dim_scatter_scalar.cu +++ b/oneflow/user/kernels/dim_scatter_scalar.cu @@ -1,28 +1,46 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ #ifdef WITH_CUDA #include "oneflow/user/kernels/dim_scatter_scalar.h" -namespace oneflow{ +namespace oneflow { -namespace user_op{ +namespace user_op { -namespace{ +namespace { -template -__global__ void DoCUDADimScatterScalarUpdate(const DimOpIndexNdHelper idx_nd_helper, - const DimOpIndexNdHelper output_nd_helper, - const int ndim, const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, - const IDX_T* index, const IN_T src_scalar, IN_T* output) { - ScatterScalarUpdateFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, src_scalar, output); - } -} // namespace +template +__global__ void DoCUDADimScatterScalarUpdate(const DimOpIndexNdHelper idx_nd_helper, + const DimOpIndexNdHelper output_nd_helper, + const int ndim, const int64_t elem_cnt, + const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const IN_T src_scalar, + IN_T* output) { + ScatterScalarUpdateFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, src_scalar, output); +} +} // namespace -template -class GpuDimScatterScalarUpdateKernel final : public OpKernel { - public: - GpuDimScatterScalarUpdateKernel() = default; - ~GpuDimScatterScalarUpdateKernel() = default; - - private: +template +class GpuDimScatterScalarUpdateKernel final : public OpKernel { + public: + GpuDimScatterScalarUpdateKernel() = default; + ~GpuDimScatterScalarUpdateKernel() = default; + + private: void Compute(KernelComputeContext* ctx) const override { const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); @@ -42,7 +60,7 @@ class GpuDimScatterScalarUpdateKernel final : public OpKernel { } else if (like_tensor) { Memset(ctx->device_ctx(), output, 0, out_bytes_size); } else { - std::cout<<"Unimplemented Error"< shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), - [](int32_t dim) -> IDX_T { return static_cast(dim); }); + [](int32_t dim) -> IDX_T { return static_cast(dim); }); }; shape2dims(index_tensor->shape()); DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); @@ -58,14 +76,13 @@ class GpuDimScatterScalarUpdateKernel final : public OpKernel { DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); int64_t upper_bound = input_tensor->shape().At(dim); - int64_t elem_cnt = index_tensor->shape().elem_cnt(); - - RUN_CUDA_KERNEL((DoCUDADimScatterScalarUpdate), ctx->device_ctx(), BlocksNum4ThreadsNum(elem_cnt), - idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, - src_scalar, output); + int64_t elem_cnt = index_tensor->shape().elem_cnt(); + RUN_CUDA_KERNEL((DoCUDADimScatterScalarUpdate), ctx->device_ctx(), + BlocksNum4ThreadsNum(elem_cnt), idx_nd_helper, output_nd_helper, ndim, elem_cnt, + dim, upper_bound, index, src_scalar, output); } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; #define REGISTER_GPU_SCATTERSCALAR_KERNEL(device, dtype, itype) \ @@ -73,7 +90,7 @@ class GpuDimScatterScalarUpdateKernel final : public OpKernel { .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); + & 
(user_op::HobDataType("index", 0) == GetDataType::value)); REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int32_t); REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int64_t); @@ -82,7 +99,6 @@ REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float16, int64_t); REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int32_t); REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int64_t); - -} // namespace user_op -} // namespace oneflow +} // namespace user_op +} // namespace oneflow #endif diff --git a/oneflow/user/kernels/dim_scatter_scalar.h b/oneflow/user/kernels/dim_scatter_scalar.h index e990e417d2a..79c00e092d3 100644 --- a/oneflow/user/kernels/dim_scatter_scalar.h +++ b/oneflow/user/kernels/dim_scatter_scalar.h @@ -1,3 +1,18 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ #ifndef ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_H_ #define ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_H_ #include "oneflow/core/device/device_context.h" @@ -6,9 +21,9 @@ #include "oneflow/core/framework/framework.h" #include "oneflow/core/common/data_type.h" -namespace oneflow{ +namespace oneflow { -namespace user_op{ +namespace user_op { constexpr int kDimGatherMaxDimCount = 8; @@ -17,29 +32,31 @@ using DimOpIndexNdHelper = NdIndexOffsetHelper; template OF_DEVICE_FUNC void ScatterScalarUpdateFunctor(const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, const IDX_T* index, - const IN_T src, IN_T* output) { + const DimOpIndexNdHelper& output_nd_helper, + const int ndim, const int64_t elem_cnt, + const int32_t dim, int64_t upper_bound, + const IDX_T* index, const IN_T src, IN_T* output) { XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; - idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk + idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk IDX_T idx_elem = index[idx_offset]; - if(idx_elem>=upper_bound){ - #if __CUDA_ARCH__ - __trap(); - #else - std::cout<<"The index element "<= upper_bound) { +#if __CUDA_ARCH__ + __trap(); +#else + std::cout << "The index element " << idx_elem << " is out of bounds for dimension " << dim + << " with size " << upper_bound << std::endl; throw Error::CheckFailedError(); - #endif +#endif } coordinate[dim] = idx_elem; IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); - *(output+output_offset) = src; + *(output + output_offset) = src; } } -} // namespace user op -} // namespace oneflow +} // namespace user_op +} // namespace oneflow -#endif // ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_H_ +#endif // ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_H_ diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index 0b8a275b679..9720d3632e6 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp @@ -19,77 +19,74 @@ limitations 
under the License. namespace oneflow { namespace user_op { - REGISTER_USER_OP("dim_gather") -.Input("input") -.Input("index") -.Output("output") -.Attr("dim") -.SetTensorDescInferFn([](user_op::InferContext* ctx) -> Maybe { - const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); - int64_t input_num_axes = in->shape().NumAxes(); - CHECK_GT_OR_RETURN(input_num_axes, 0); - CHECK_LE_OR_RETURN(input_num_axes, kDimGatherMaxDimCount); - - const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); - int64_t index_num_axes = index->shape().NumAxes(); - - const int32_t dim = ctx->Attr("dim"); - CHECK_GE_OR_RETURN(dim, 0); - CHECK_LT_OR_RETURN(dim, input_num_axes); - CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); - - CHECK_EQ_OR_RETURN(in->is_dynamic(), index->is_dynamic()); - - - user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); - *out->mut_shape() = index->shape(); - - return Maybe::Ok(); -}) -.SetDataTypeInferFn([](user_op::InferContext* ctx) -> Maybe { - const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); - CHECK_OR_RETURN(IsIndexDataType(index->data_type())); - const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); - user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); - *out->mut_data_type() = in->data_type(); - return Maybe::Ok(); -}) -.SetInputArgModifyFn([](user_op::GetInputArgModifier GetInputArgModifierFn, - const user_op::UserOpConfWrapper&) { - user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); - CHECK(indices_modifier != nullptr); - indices_modifier->set_requires_grad(false); -}) -.SetGetSbpFn([](user_op::SbpContext* ctx) -> Maybe { - const user_op::TensorDesc& index_tensor = - ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); - int64_t index_num_axes = index_tensor.shape().NumAxes(); - const int32_t dim = ctx->Attr("dim"); - - FOR_RANGE(int64_t, i, 0, index_num_axes) { - if (i != dim) { - ctx->NewBuilder() - .Split(user_op::OpArg("index", 0), i) - .Split(user_op::OpArg("input", 0), i) - .Split(user_op::OpArg("output", 0), i) - .Build(); - } else if (i == dim) { + .Input("input") + .Input("index") + .Output("output") + .Attr("dim") + .SetTensorDescInferFn([](user_op::InferContext* ctx) -> Maybe { + const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); + int64_t input_num_axes = in->shape().NumAxes(); + CHECK_GT_OR_RETURN(input_num_axes, 0); + CHECK_LE_OR_RETURN(input_num_axes, kDimGatherMaxDimCount); + + const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); + int64_t index_num_axes = index->shape().NumAxes(); + + const int32_t dim = ctx->Attr("dim"); + CHECK_GE_OR_RETURN(dim, 0); + CHECK_LT_OR_RETURN(dim, input_num_axes); + CHECK_EQ_OR_RETURN(input_num_axes, index_num_axes); + + CHECK_EQ_OR_RETURN(in->is_dynamic(), index->is_dynamic()); + + user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); + *out->mut_shape() = index->shape(); + + return Maybe::Ok(); + }) + .SetDataTypeInferFn([](user_op::InferContext* ctx) -> Maybe { + const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); + CHECK_OR_RETURN(IsIndexDataType(index->data_type())); + const TensorDesc* in = ctx->TensorDesc4ArgNameAndIndex("input", 0); + user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); + *out->mut_data_type() = in->data_type(); + return Maybe::Ok(); + }) + .SetInputArgModifyFn([](user_op::GetInputArgModifier GetInputArgModifierFn, + const user_op::UserOpConfWrapper&) { + user_op::InputArgModifier* indices_modifier = 
GetInputArgModifierFn("index", 0); + CHECK(indices_modifier != nullptr); + indices_modifier->set_requires_grad(false); + }) + .SetGetSbpFn([](user_op::SbpContext* ctx) -> Maybe { + const user_op::TensorDesc& index_tensor = + ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); + int64_t index_num_axes = index_tensor.shape().NumAxes(); + const int32_t dim = ctx->Attr("dim"); + + FOR_RANGE(int64_t, i, 0, index_num_axes) { + if (i != dim) { + ctx->NewBuilder() + .Split(user_op::OpArg("index", 0), i) + .Split(user_op::OpArg("input", 0), i) + .Split(user_op::OpArg("output", 0), i) + .Build(); + } else if (i == dim) { + ctx->NewBuilder() + .Broadcast(user_op::OpArg("input", 0)) + .Split(user_op::OpArg("index", 0), i) + .Split(user_op::OpArg("output", 0), i) + .Build(); + } + } ctx->NewBuilder() - .Broadcast(user_op::OpArg("input", 0)) - .Split(user_op::OpArg("index", 0), i) - .Split(user_op::OpArg("output", 0), i) + .PartialSum(user_op::OpArg("input", 0)) + .Broadcast(user_op::OpArg("index", 0)) + .PartialSum(user_op::OpArg("output", 0)) .Build(); - } - } - ctx->NewBuilder() - .PartialSum(user_op::OpArg("input", 0)) - .Broadcast(user_op::OpArg("index", 0)) - .PartialSum(user_op::OpArg("output", 0)) - .Build(); - return Maybe::Ok(); -}); - + return Maybe::Ok(); + }); REGISTER_USER_OP_GRAD("dim_gather").SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { const auto op_grad_name = ctx->FwOp().op_name() + "_grad"; diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 7ecd110b6f8..ab344bc0170 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -38,8 +38,8 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { CHECK_LE_OR_RETURN(src_num_axes, kDimGatherMaxDimCount); int64_t index_num_axes = index->shape().NumAxes(); CHECK_EQ_OR_RETURN(src_num_axes, index_num_axes); - - int64_t output_num_axes = 0; + + int64_t output_num_axes = 0; if (input) { output_num_axes = input->shape().NumAxes(); } else if (like) { @@ -48,23 +48,22 @@ Maybe InferTensorDesc(user_op::InferContext* ctx) { throw Error::Unimplemented(); } CHECK_EQ_OR_RETURN(output_num_axes, index_num_axes); - + // check index.shape(i) <= input/like.shape(i) FOR_RANGE(int64_t, i, 0, index_num_axes) { - if(i==dim) continue; - if(input){ + if (i == dim) continue; + if (input) { CHECK_LE_OR_RETURN(index->shape().At(i), input->shape().At(i)); - } - else{ + } else { CHECK_LE_OR_RETURN(index->shape().At(i), like->shape().At(i)); - } } - + } + // check index.shape(i) <= src.shape(i) FOR_RANGE(int64_t, i, 0, index_num_axes) { - if(i==dim) continue; + if (i == dim) continue; CHECK_LE_OR_RETURN(index->shape().At(i), src->shape().At(i)); - } + } user_op::TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = input ? 
input->shape() : like->shape(); @@ -81,13 +80,13 @@ Maybe InferScalarTensorDesc(user_op::InferContext* ctx) { int64_t output_num_axes = input->shape().NumAxes(); int64_t index_num_axes = index->shape().NumAxes(); CHECK_EQ_OR_RETURN(output_num_axes, index_num_axes); - + // check index.shape(i) <= input/like.shape(i) FOR_RANGE(int64_t, i, 0, index_num_axes) { - if(i==dim) continue; + if (i == dim) continue; CHECK_LE_OR_RETURN(index->shape().At(i), input->shape().At(i)); } - + TensorDesc* out = ctx->TensorDesc4ArgNameAndIndex("output", 0); *out->mut_shape() = input->shape(); return Maybe::Ok(); @@ -95,26 +94,24 @@ Maybe InferScalarTensorDesc(user_op::InferContext* ctx) { Maybe InputArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, const user_op::UserOpConfWrapper&) { - user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); CHECK(indices_modifier != nullptr); indices_modifier->set_requires_grad(false); - + return Maybe::Ok(); } Maybe InputScalarArgModifierFn(user_op::GetInputArgModifier GetInputArgModifierFn, - const user_op::UserOpConfWrapper&) { + const user_op::UserOpConfWrapper&) { user_op::InputArgModifier* indices_modifier = GetInputArgModifierFn("index", 0); CHECK(indices_modifier != nullptr); indices_modifier->set_requires_grad(false); - + return Maybe::Ok(); } void _SetSbp(user_op::SbpContext* ctx, const char* like_or_input) { - const user_op::TensorDesc& index_tensor = - ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); + const user_op::TensorDesc& index_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); int64_t index_num_axes = index_tensor.shape().NumAxes(); const int32_t dim = ctx->Attr("dim"); @@ -165,10 +162,9 @@ Maybe InferDtype(user_op::InferContext* ctx) { const TensorDesc* index = ctx->TensorDesc4ArgNameAndIndex("index", 0); CHECK_OR_RETURN(IsIndexDataType(index->data_type())); const TensorDesc* input = ctx->TensorDesc4ArgNameAndIndex("input", 0); - if(input){ + if (input) { CHECK_EQ_OR_RETURN(ctx->InputDType("input", 0), ctx->InputDType("src", 0)); - } - else{ + } else { CHECK_EQ_OR_RETURN(ctx->InputDType("like", 0), ctx->InputDType("src", 0)); } *ctx->OutputDType("output", 0) = ctx->InputDType("src", 0); @@ -188,105 +184,103 @@ Maybe InferScalarDtype(user_op::InferContext* ctx) { REGISTER_USER_OP(optypename) \ .Input("like") \ .Input("index") \ - .Input("src") \ + .Input("src") \ .Output("output") \ .Attr("dim") \ .SetTensorDescInferFn(InferTensorDesc) \ .SetInputArgModifyFn(InputArgModifierFn) \ - .SetDataTypeInferFn(InferDtype) \ + .SetDataTypeInferFn(InferDtype) \ .SetGetSbpFn(SetSbpLike) - -#define REGISTER_SCATTER_OP(optypename) \ - REGISTER_USER_OP(optypename) \ - .Input("input") \ - .Input("index") \ - .Input("src") \ - .Output("output") \ - .Attr("dim") \ - .SetTensorDescInferFn(InferTensorDesc) \ +#define REGISTER_SCATTER_OP(optypename) \ + REGISTER_USER_OP(optypename) \ + .Input("input") \ + .Input("index") \ + .Input("src") \ + .Output("output") \ + .Attr("dim") \ + .SetTensorDescInferFn(InferTensorDesc) \ .SetInputArgModifyFn(InputArgModifierFn) \ - .SetDataTypeInferFn(InferDtype) \ + .SetDataTypeInferFn(InferDtype) \ .SetGetSbpFn(SetSbpScatter) #define REGISTER_SCATTER_SCALAR_OP(optypename) \ - REGISTER_USER_OP(optypename) \ - .Input("input") \ - .Input("index") \ - .Attr("src_scalar") \ - .Output("output") \ - .Attr("dim") \ - .SetTensorDescInferFn(InferScalarTensorDesc) \ + REGISTER_USER_OP(optypename) \ + .Input("input") \ + .Input("index") \ + .Attr("src_scalar") \ + .Output("output") \ + 
.Attr("dim") \ + .SetTensorDescInferFn(InferScalarTensorDesc) \ .SetInputArgModifyFn(InputScalarArgModifierFn) \ - .SetDataTypeInferFn(InferScalarDtype) \ + .SetDataTypeInferFn(InferScalarDtype) \ .SetGetSbpFn(SetSbpScatter) -#define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ - REGISTER_USER_OP_GRAD(optypename) \ - .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ - const TensorDesc& src = ctx->FwOp().TensorDesc4ArgNameAndIndex("src", 0); \ - const TensorDesc& index = ctx->FwOp().TensorDesc4ArgNameAndIndex("index", 0); \ - const int64_t ndim = src.shape().NumAxes(); \ - bool backprop_flag = true; \ - FOR_RANGE(int64_t, i, 0, ndim) { \ - if(index.shape().At(i)!=src.shape().At(i)){ \ - backprop_flag = false; \ - break; \ - } \ - }\ - if(backprop_flag){ \ - const auto op_src_grad_name = ctx->FwOp().op_name() + "_src_grad"; \ - ctx->DefineOp(op_src_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ - return builder.OpTypeName("dim_gather") \ - .InputBind("index", ctx->FwOp().input("index", 0)) \ - .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ - .Output("output") \ - .Attr("dim", ctx->FwOp().attr("dim")) \ - .Build(); \ - }); \ - ctx->FwOp().InputGradBind(user_op::OpArg("src", 0), \ - [&ctx, &op_src_grad_name]() -> const std::string& { \ - return ctx->GetOp(op_src_grad_name).output("output", 0); \ - }); \ - const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; \ +#define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ + REGISTER_USER_OP_GRAD(optypename) \ + .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ + const TensorDesc& src = ctx->FwOp().TensorDesc4ArgNameAndIndex("src", 0); \ + const TensorDesc& index = ctx->FwOp().TensorDesc4ArgNameAndIndex("index", 0); \ + const int64_t ndim = src.shape().NumAxes(); \ + bool backprop_flag = true; \ + FOR_RANGE(int64_t, i, 0, ndim) { \ + if (index.shape().At(i) != src.shape().At(i)) { \ + backprop_flag = false; \ + break; \ + } \ + } \ + if (backprop_flag) { \ + const auto op_src_grad_name = ctx->FwOp().op_name() + "_src_grad"; \ + ctx->DefineOp(op_src_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ + return builder.OpTypeName("dim_gather") \ + .InputBind("index", ctx->FwOp().input("index", 0)) \ + .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ + .Output("output") \ + .Attr("dim", ctx->FwOp().attr("dim")) \ + .Build(); \ + }); \ + ctx->FwOp().InputGradBind(user_op::OpArg("src", 0), \ + [&ctx, &op_src_grad_name]() -> const std::string& { \ + return ctx->GetOp(op_src_grad_name).output("output", 0); \ + }); \ + const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; \ ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ - return builder.OpTypeName("dim_scatter_scalar_update") \ - .InputBind("index", ctx->FwOp().input("index", 0)) \ - .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ - .Output("output") \ - .Attr("dim", ctx->FwOp().attr("dim")) \ - .Attr("src_scalar", static_cast(0.0)) \ - .Build(); \ - }); \ - ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ + return builder.OpTypeName("dim_scatter_scalar_update") \ + .InputBind("index", ctx->FwOp().input("index", 0)) \ + .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ + .Output("output") \ + .Attr("dim", ctx->FwOp().attr("dim")) \ + .Attr("src_scalar", static_cast(0.0)) \ + .Build(); \ + }); \ + ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ [&ctx, &op_input_grad_name]() -> const std::string& { \ return 
ctx->GetOp(op_input_grad_name).output("output", 0); \ - }); \ - } \ - else{ \ - std::cout<<"The backward pass is implemented only for src.shape == index.shape."<FwOp().op_name() + "_input_grad"; \ +#define REGISTER_USER_OP_GRAD_SCATTER_SCALAR(optypename) \ + REGISTER_USER_OP_GRAD(optypename) \ + .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ + const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; \ ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ - return builder.OpTypeName("dim_scatter_scalar_update") \ - .InputBind("index", ctx->FwOp().input("index", 0)) \ - .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ - .Output("output") \ - .Attr("dim", ctx->FwOp().attr("dim")) \ - .Attr("src_scalar", static_cast(0.0)) \ - .Build(); \ - }); \ - ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ + return builder.OpTypeName("dim_scatter_scalar_update") \ + .InputBind("index", ctx->FwOp().input("index", 0)) \ + .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ + .Output("output") \ + .Attr("dim", ctx->FwOp().attr("dim")) \ + .Attr("src_scalar", static_cast(0.0)) \ + .Build(); \ + }); \ + ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ [&ctx, &op_input_grad_name]() -> const std::string& { \ return ctx->GetOp(op_input_grad_name).output("output", 0); \ - }); \ + }); \ }); REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); @@ -300,6 +294,6 @@ REGISTER_SCATTER_SCALAR_OP("dim_scatter_scalar_add"); REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_add"); REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update"); -REGISTER_USER_OP_GRAD_SCATTER_SCALAR("dim_scatter_scalar_update"); +REGISTER_USER_OP_GRAD_SCATTER_SCALAR("dim_scatter_scalar_update"); } // namespace user_op } // namespace oneflow \ No newline at end of file From f0da7ab8c3091327da3cd4a2464c6138d8476572 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 12 Jul 2021 13:27:24 +0800 Subject: [PATCH 50/82] add new line --- oneflow/user/kernels/dim_gather_scatter_util.h | 2 +- oneflow/user/kernels/dim_scatter_kernel_util.cpp | 2 +- oneflow/user/ops/dim_scatter_ops.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index e1530dad5d0..84550e8838f 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -166,4 +166,4 @@ struct DeviceBinOp { } // namespace user_op } // namespace oneflow -#endif \ No newline at end of file +#endif diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.cpp index 577416c1ce0..04027840a68 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cpp @@ -27,4 +27,4 @@ IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Update); INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Add); INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Update); } // namespace user_op -} // namespace oneflow \ No newline at end of file +} // namespace oneflow diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index ab344bc0170..9ad935f1f67 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -296,4 +296,4 @@ REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update"); REGISTER_USER_OP_GRAD_SCATTER_SCALAR("dim_scatter_scalar_update"); } // namespace user_op -} // namespace oneflow \ No newline at end of file +} // namespace oneflow From 
e1322b74049340868642d71f47ffd8255a01f36c Mon Sep 17 00:00:00 2001 From: YaoChi Date: Fri, 16 Jul 2021 08:49:33 +0800 Subject: [PATCH 51/82] refine --- oneflow/user/ops/dim_gather_op.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index dee56c24179..7e2d59b17d4 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp @@ -40,14 +40,6 @@ REGISTER_USER_OP("dim_gather") CHECK_EQ_OR_RETURN(in.is_dynamic(), index.is_dynamic()); -<<<<<<< HEAD -======= - FOR_RANGE(int64_t, i, 0, input_num_axes) { - if (i == dim) { continue; } - CHECK_EQ_OR_RETURN(in.shape().At(i), index.shape().At(i)); - } - ->>>>>>> master user_op::TensorDesc* out = ctx->OutputTensorDesc("output", 0); *out->mut_shape() = index.shape(); From 24f82e86b028d3148ef3fd72f0263a6782f0e28a Mon Sep 17 00:00:00 2001 From: YaoChi Date: Fri, 16 Jul 2021 11:02:45 +0800 Subject: [PATCH 52/82] revert dim gather --- .../user/kernels/dim_gather_kernel_util.cpp | 13 +- .../user/kernels/dim_gather_kernel_util.cu | 37 +++++- oneflow/user/kernels/dim_gather_kernel_util.h | 117 ++++++++++-------- oneflow/user/kernels/dim_gather_kernels.cpp | 94 ++++++-------- .../user/kernels/dim_gather_scatter_util.h | 50 -------- 5 files changed, 149 insertions(+), 162 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cpp b/oneflow/user/kernels/dim_gather_kernel_util.cpp index f8c893f0169..8262c70ba91 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cpp +++ b/oneflow/user/kernels/dim_gather_kernel_util.cpp @@ -20,8 +20,17 @@ namespace oneflow { namespace user_op { -IMPLEMENT_DIMGATHER_CPUFUNCTOR(Update); -INSTANTIATE_DIM_GATHER_CPUFUNCTORS(Update); +template +struct DimGatherFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { + DoDimGather(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, + output); + } +}; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_GATHER_FUNCTOR, (DeviceType::kCPU), + DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ, INDEX_DATA_TYPE_SEQ); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cu b/oneflow/user/kernels/dim_gather_kernel_util.cu index e6d88cb69e1..c7b228aa893 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cu +++ b/oneflow/user/kernels/dim_gather_kernel_util.cu @@ -14,15 +14,46 @@ See the License for the specific language governing permissions and limitations under the License. 
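Patch 52 above restores the standalone DimGatherFunctor/DoDimGather path for dim_gather. This gather is also what the scatter backward registered earlier relies on: when index.shape equals src.shape, the src gradient is produced by gathering the output gradient with the same index. For reference, a minimal NumPy sketch of the gather semantics; np_dim_gather here is a simplified stand-in for the tests' own _np_dim_gather helper, assuming a small dense input:

    import numpy as np

    def np_dim_gather(dim, input, index):
        # output has index's shape; each element reads input at the same
        # coordinate, with the `dim` component replaced by the index value.
        output = np.empty(index.shape, dtype=input.dtype)
        for pos in np.ndindex(*index.shape):
            coord = list(pos)
            coord[dim] = int(index[pos])
            output[pos] = input[tuple(coord)]
        return output

    x = np.array([[1, 2], [3, 4]])
    idx = np.array([[0, 0], [1, 0]])
    np_dim_gather(0, x, idx)  # [[1, 2], [3, 2]]

The restored DoDimGather loop in the hunks below implements exactly this per-element rule, with the coordinate arithmetic done through DimOpIndexNdHelper instead of Python tuples.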
*/ #ifdef WITH_CUDA -// #include "oneflow/core/kernel/util/cuda_kernel_util.h" +#include "oneflow/core/framework/framework.h" #include "oneflow/user/kernels/dim_gather_kernel_util.h" namespace oneflow { namespace user_op { -IMPLEMENT_DIMGATHER_GPUFUNCTOR(Update); -INSTANTIATE_DIM_GATHER_GPUFUNCTORS(Update); +template +__global__ void DoCUDADimGather(const DimOpIndexNdHelper input_nd_helper, + const DimOpIndexNdHelper index_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, const IDX_T* index, + const IN_T* input, IN_T* output) { + DoDimGather(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, + output); +} + +template +struct DimGatherFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { + RUN_CUDA_KERNEL((DoCUDADimGather), ctx, BlocksNum4ThreadsNum(elem_cnt), + input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, output); + } +}; + +// float16 special case of DimGatherFunctor template +template +struct DimGatherFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const float16* input, float16* output) { + RUN_CUDA_KERNEL((DoCUDADimGather), ctx, BlocksNum4ThreadsNum(elem_cnt), + input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, + reinterpret_cast(input), reinterpret_cast(output)); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_GATHER_FUNCTOR, (DeviceType::kGPU), + DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ, INDEX_DATA_TYPE_SEQ); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernel_util.h b/oneflow/user/kernels/dim_gather_kernel_util.h index e717c9be736..6a12dcc0fe7 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.h +++ b/oneflow/user/kernels/dim_gather_kernel_util.h @@ -15,39 +15,48 @@ limitations under the License. */ #ifndef ONEFLOW_USER_KERNELS_DIM_GATHER_KERNEL_UTIL_H_ #define ONEFLOW_USER_KERNELS_DIM_GATHER_KERNEL_UTIL_H_ -#include "oneflow/user/kernels/dim_gather_scatter_util.h" - -// Steps for adding a binary operation on gathers are as follows: -// 1. implment binop in DeviceBinOp, for example "Mul": -// OF_DEVICE_FUNC static void Mul(const T* x, T* y) { *y *= *x; } -// -// 2. Declare Functor in dim_gather_kernel_util.h: -// DECLARE_DIMGATHER_FUNCTOR(Mul); -// -// 3. Implement functors in dim_gather_kernel_util.cu and cpp file: -// in .cu file: -// IMPLEMENT_DIMGATHER_GPUFUNCTOR(Mul); -// INSTANTIATE_DIM_GATHER_GPUFUNCTORS(Mul); -// in .cpp file: -// IMPLEMENT_DIMGATHER_CPUFUNCTOR(Mul); -// INSTANTIATE_DIM_GATHER_CPUFUNCTORS(Mul); -// -// 4. Implement kernels in dim_gather_kernels.cpp: -// IMPLEMENT_DIMGATHER_KERNEL_CLASS(Mul); -// -// 5. 
Register kernels in dim_gather_kernels.cpp: -// REGISTER_GATHER_OUTPLACE_KERNEL("dim_gather_mul_like", Mul); +#ifdef WITH_CUDA +#include "oneflow/core/cuda/atomic.cuh" +#endif // WITH_CUDA +#include "oneflow/core/ndarray/xpu_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" namespace oneflow { + +#define DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ \ + FLOATING_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) + +#define DIM_GATHER_SCATTER_DATA_TYPE_GPU_SEQ \ + DIM_GATHER_SCATTER_DATA_TYPE_CPU_SEQ \ + FLOAT16_DATA_TYPE_SEQ + +constexpr int kDimGatherMaxDimCount = 8; + +template +using DimOpIndexNdHelper = NdIndexOffsetHelper; + namespace user_op { -DECLARE_DIMGATHER_FUNCTOR(Update); +template +struct DimGatherFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output); +}; + +template +struct DimScatterAddFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, + int32_t dim, const IDX_T* index, const IN_T* src, IN_T* output); +}; template -OF_DEVICE_FUNC void DoDimGatherBinop(const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, - const IN_T* input, IN_T* output, BinaryOpFn bin_op) { +OF_DEVICE_FUNC void DoDimGather(const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& index_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, const IDX_T* index, + const IN_T* input, IN_T* output) { XPU_1D_KERNEL_LOOP(index_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; const IDX_T x = index[index_offset]; @@ -55,32 +64,40 @@ OF_DEVICE_FUNC void DoDimGatherBinop(const DimOpIndexNdHelper& input_nd_h coordinate[dim] = x; IDX_T input_offset = input_nd_helper.NdIndexToOffset(coordinate, ndim); - bin_op(input + input_offset, output + index_offset); + output[index_offset] = input[input_offset]; + } +} + +template +struct DeviceAdd { + OF_DEVICE_FUNC static void Invoke(const T* x, T* y) { +#ifdef __CUDA_ARCH__ + cuda::atomic::Add(y, *x); // TODO:(YaoChi), refine add using float16 -> half -> float -> half +#else + *y += *x; +#endif + }; +}; + +template +OF_DEVICE_FUNC void DoDimScatterAdd(const DimOpIndexNdHelper& input_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, int ndim, + int64_t elem_cnt, int32_t dim, const IDX_T* index, + const IN_T* input, IN_T* output) { + XPU_1D_KERNEL_LOOP(input_offset, elem_cnt) { + IDX_T coordinate[kDimGatherMaxDimCount] = {0}; + input_nd_helper.OffsetToNdIndex(input_offset, coordinate, ndim); + coordinate[dim] = index[input_offset]; + + IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); + DeviceAdd::Invoke(input + input_offset, output + output_offset); } } -#define INSTANTIATE_DIM_GATHER_FUNCTOR(devicetype, dtype, itype, binop) \ - template struct DimGather##binop##Functor; - -#define INSTANTIATE_DIM_GATHER_GPUFUNCTORS(binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, int32_t, int32_t, binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, float, int32_t, binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, double, int32_t, binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, float16, int32_t, binop) \ - \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, int32_t, int64_t, binop) \ - 
INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, float, int64_t, binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, double, int64_t, binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kGPU, float16, int64_t, binop) - -#define INSTANTIATE_DIM_GATHER_CPUFUNCTORS(binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, int32_t, int32_t, binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, float, int32_t, binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, double, int32_t, binop) \ - \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, int32_t, int64_t, binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, float, int64_t, binop) \ - INSTANTIATE_DIM_GATHER_FUNCTOR(DeviceType::kCPU, double, int64_t, binop) +// macros for functors instantiate(used by dim_gather_kernel_util.cu and dim_gather_kernel_uti.cpp) +#define INSTANTIATE_DIM_GATHER_FUNCTOR(device_type_v, dtype_pair, itype_pair) \ + template struct DimGatherFunctor; } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_kernels.cpp b/oneflow/user/kernels/dim_gather_kernels.cpp index ac2c62acbe8..fc37bcff723 100644 --- a/oneflow/user/kernels/dim_gather_kernels.cpp +++ b/oneflow/user/kernels/dim_gather_kernels.cpp @@ -14,69 +14,28 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/common/shape_view.h" +#include "oneflow/core/framework/framework.h" #include "oneflow/user/kernels/dim_gather_kernel_util.h" namespace oneflow { namespace user_op { -#define IMPLEMENT_DIMGATHER_KERNEL_CLASS(binop) \ - template \ - class DimGather##binop##Kernel final : public DimGatherBaseKernel { \ - public: \ - DimGather##binop##Kernel() = default; \ - ~DimGather##binop##Kernel() override = default; \ - \ - private: \ - void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* input, \ - IN_T* output) const override { \ - DimGather##binop##Functor()( \ - ctx, input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, output); \ - } \ - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ - }; - -#define REGISTER_DIM_GATHER_KERNEL(device, dtype, itype, optypename, binop) \ - REGISTER_USER_KERNEL(optypename) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); - -#define REGISTER_DIM_GATHER_BINOP_KERNELS_DEVICE(device, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, float, int32_t, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, double, int32_t, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, int32_t, int32_t, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, float, int64_t, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, double, int64_t, optypename, binop) \ - REGISTER_DIM_GATHER_KERNEL(device, int32_t, int64_t, optypename, binop) +namespace { -#define REGISTER_DIM_GATHER_CPUKERNELS(optypename, binop) \ - REGISTER_DIM_GATHER_BINOP_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); +template +void ConvertShape2Array(const ShapeView& shape_view, IDX_T* array, int64_t num_axis) { + FOR_RANGE(int64_t, i, 0, num_axis) { array[i] = shape_view.At(i); } +} -#ifdef WITH_CUDA -#define 
REGISTER_DIM_GATHER_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_GATHER_BINOP_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ - REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ - REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); -#else -#define REGISTER_DIM_GATHER_GPUKERNELS(optypename, binop) -#endif // WITH_CUDA - -#define REGISTER_GATHER_KERNEL(optypename, binop) \ - REGISTER_DIM_GATHER_CPUKERNELS(optypename, binop); \ - REGISTER_DIM_GATHER_GPUKERNELS(optypename, binop); +} // namespace template -class DimGatherBaseKernel : public user_op::OpKernel { +class DimGatherKernel final : public user_op::OpKernel { public: - DimGatherBaseKernel() = default; - ~DimGatherBaseKernel() override = default; - virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, const IDX_T* index, const IN_T* input, - IN_T* output) const = 0; + DimGatherKernel() = default; + ~DimGatherKernel() override = default; private: void Compute(KernelComputeContext* ctx) const override { @@ -100,14 +59,35 @@ class DimGatherBaseKernel : public user_op::OpKernel { shape2dims(index_tensor->shape()); DimOpIndexNdHelper index_nd_helper(shape_vec.data(), ndim); - BinaryOp(ctx->device_ctx(), input_nd_helper, index_nd_helper, ndim, - index_tensor->shape().elem_cnt(), dim, index, input, output); + DimGatherFunctor()( + ctx->device_ctx(), input_nd_helper, index_nd_helper, ndim, index_tensor->shape().elem_cnt(), + dim, index, input, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -IMPLEMENT_DIMGATHER_KERNEL_CLASS(Update); -REGISTER_GATHER_KERNEL("dim_gather", Update); +#define REGISTER_DIM_GATHER_KERNEL(device, dtype, itype) \ + REGISTER_USER_KERNEL("dim_gather") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); + +#define REGISTER_DIM_GATHER_KERNELS_WITH_DEVICE(device) \ + REGISTER_DIM_GATHER_KERNEL(device, float, int32_t) \ + REGISTER_DIM_GATHER_KERNEL(device, double, int32_t) \ + REGISTER_DIM_GATHER_KERNEL(device, int32_t, int32_t) \ + REGISTER_DIM_GATHER_KERNEL(device, float, int64_t) \ + REGISTER_DIM_GATHER_KERNEL(device, double, int64_t) \ + REGISTER_DIM_GATHER_KERNEL(device, int32_t, int64_t) + +REGISTER_DIM_GATHER_KERNELS_WITH_DEVICE(DeviceType::kCPU); + +#ifdef WITH_CUDA +REGISTER_DIM_GATHER_KERNELS_WITH_DEVICE(DeviceType::kGPU); +REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int32_t); +REGISTER_DIM_GATHER_KERNEL(DeviceType::kGPU, float16, int64_t); +#endif // WITH_CUDA } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index 84550e8838f..b689d5343ed 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -113,56 +113,6 @@ struct DeviceBinOp { } \ } -// ----- macros for gather functors ----- -#define DECLARE_DIMGATHER_FUNCTOR(binop) \ - template \ - struct DimGather##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output); \ - } - -#define IMPLEMENT_DIMGATHER_CPUFUNCTOR(binop) \ - template \ 
- struct DimGather##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { \ - DoDimGatherBinop(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, \ - input, output, DeviceBinOp::binop); \ - } \ - } - -#define IMPLEMENT_DIMGATHER_GPUFUNCTOR(binop) \ - template \ - __global__ void DoCUDADimGather##binop(const DimOpIndexNdHelper input_nd_helper, \ - const DimOpIndexNdHelper index_nd_helper, \ - int ndim, int64_t elem_cnt, int32_t dim, \ - const IDX_T* index, const IN_T* input, IN_T* output) { \ - DoDimGatherBinop(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, \ - input, output, DeviceBinOp::binop); \ - } \ - template \ - struct DimGather##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { \ - RUN_CUDA_KERNEL((DoCUDADimGather##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ - input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, \ - output); \ - } \ - }; \ - template \ - struct DimGather##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& input_nd_helper, \ - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, const IDX_T* index, const float16* input, float16* output) { \ - RUN_CUDA_KERNEL((DoCUDADimGather##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ - input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, \ - reinterpret_cast(input), reinterpret_cast(output)); \ - } \ - }; - } // namespace user_op } // namespace oneflow From 01acd084eff7bdf3b4decab74f671c2172a6df21 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sat, 17 Jul 2021 15:12:20 +0800 Subject: [PATCH 53/82] extract dim_scatter_add --- oneflow/core/functional/functional_api.yaml | 8 ++ .../core/functional/impl/array_functor.cpp | 46 ++++++++ oneflow/user/kernels/dim_scatter_kernels.cpp | 110 +++++++++++++++++- oneflow/user/ops/dim_gather_op.cpp | 4 +- oneflow/user/ops/dim_scatter_ops.cpp | 1 - 5 files changed, 163 insertions(+), 6 deletions(-) diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 5e33c253d16..578bcf0d82a 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -610,3 +610,11 @@ signature: "Tensor TensorGetItem(Tensor x, *, TensorIndex index)" bind_python: True +- name: "dim_scatter" + signature: "Tensor DimScatter(Tensor input, Tensor index, Tensor src, *, Int32 dim)" + bind_python: True + +- name: "dim_scatter_add" + signature: "Tensor DimScatterAdd(Tensor input, Tensor index, Tensor src, *, Int32 dim)" + bind_python: True + \ No newline at end of file diff --git a/oneflow/core/functional/impl/array_functor.cpp b/oneflow/core/functional/impl/array_functor.cpp index 1653768c3e3..9bb374fbca4 100644 --- a/oneflow/core/functional/impl/array_functor.cpp +++ b/oneflow/core/functional/impl/array_functor.cpp @@ -230,6 +230,50 @@ class DimGatherFunctor { std::shared_ptr op_; }; +class DimScatterFunctor { + public: + DimScatterFunctor() { + op_ = CHECK_JUST(one::OpBuilder("dim_scatter_update") + .Input("input") + .Input("index") + .Input("src") + .Output("output") + .Build()); + } + Maybe operator()(const std::shared_ptr& 
input, + const std::shared_ptr& index, + const std::shared_ptr& src, const int32_t& dim) const { + MutableAttrMap attrs; + JUST(attrs.SetAttr("dim", dim)); + return OpInterpUtil::Dispatch(*op_, {input, index, src}, attrs); + } + + private: + std::shared_ptr op_; +}; + +class DimScatterAddFunctor { + public: + DimScatterAddFunctor() { + op_ = CHECK_JUST(one::OpBuilder("dim_scatter_add") + .Input("input") + .Input("index") + .Input("src") + .Output("output") + .Build()); + } + Maybe operator()(const std::shared_ptr& input, + const std::shared_ptr& index, + const std::shared_ptr& src, const int32_t& dim) const { + MutableAttrMap attrs; + JUST(attrs.SetAttr("dim", dim)); + return OpInterpUtil::Dispatch(*op_, {input, index, src}, attrs); + } + + private: + std::shared_ptr op_; +}; + class GatherNdFunctor { public: GatherNdFunctor() { @@ -577,6 +621,8 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("Diag"); m.add_functor("DiagGrad"); m.add_functor("TensorGetItem"); + m.add_functor("DimScatter"); + m.add_functor("DimScatterAdd"); }; } // namespace functional diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index a6b0d8229e5..7b5edc22aca 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -20,6 +20,110 @@ limitations under the License. namespace oneflow { namespace user_op { +template +class DimScatterAddKernel final : public user_op::OpKernel { + public: + DimScatterAddKernel() = default; + ~DimScatterAddKernel() override = default; + + private: + void Compute(KernelComputeContext* ctx) const override { + const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); + const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); + Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); + const Tensor* src_tensor = ctx->Tensor4ArgNameAndIndex("src", 0); + const int32_t dim = ctx->Attr("dim"); + + const IDX_T* index = index_tensor->dptr(); + IN_T* output = out_tensor->mut_dptr(); + size_t out_bytes_size = + out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); + + Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); + const IN_T* src = src_tensor->dptr(); + + if (input_tensor) { + Memcpy(ctx->device_ctx(), output, input_tensor->dptr(), out_bytes_size); + } else if (like_tensor) { + Memset(ctx->device_ctx(), output, 0, out_bytes_size); + } else { + std::cout << "Unimplemented Error" << std::endl; + throw Error::Unimplemented(); + } + + const int ndim = src_tensor->shape().NumAxes(); + fixed_vector shape_vec(ndim); + auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { + std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), + [](int32_t dim) -> IDX_T { return static_cast(dim); }); + }; + shape2dims(src_tensor->shape()); + DimOpIndexNdHelper src_nd_helper(shape_vec.data(), ndim); + shape2dims(index_tensor->shape()); + DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); + shape2dims(out_tensor->shape()); + DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); + + int64_t upper_bound = 0; + if (input_tensor) { + upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound + } else { + upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound + } + + DimScatterAddFunctor()( + ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, + index_tensor->shape().elem_cnt(), dim, upper_bound, index, src, 
output); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(device, dtype, itype) \ + REGISTER_USER_KERNEL("dim_scatter_add_like") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("like", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); + +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, float, int32_t); +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, double, int32_t); +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, int32_t, int32_t); +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, float, int64_t); +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, double, int64_t); +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, int32_t, int64_t); + +#ifdef WITH_CUDA +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, float, int32_t); +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, double, int32_t); +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, int32_t, int32_t); +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, float, int64_t); +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, double, int64_t); +REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, int32_t, int64_t); +#endif // WITH_CUDA + +#define REGISTER_DIM_SCATTER_ADD_KERNEL(device, dtype, itype) \ + REGISTER_USER_KERNEL("dim_scatter_add") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); + +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, float, int32_t); +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, double, int32_t); +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, int32_t, int32_t); +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, float, int64_t); +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, double, int64_t); +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, int32_t, int64_t); + +#ifdef WITH_CUDA +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, float, int32_t); +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, double, int32_t); +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, int32_t, int32_t); +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, float, int64_t); +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, double, int64_t); +REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, int32_t, int64_t); +#endif // WITH_CUDA + #define IMPLEMENT_DIMSCATTER_KERNEL_CLASS(binop) \ template \ class DimScatter##binop##Kernel final : public DimScatterBaseKernel { \ @@ -164,12 +268,12 @@ class DimScatterBaseKernel : public user_op::OpKernel { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Add); +// IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Add); IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Update); -REGISTER_SCATTER_LIKE_KERNEL("dim_scatter_add_like", Add); +// REGISTER_SCATTER_LIKE_KERNEL("dim_scatter_add_like", Add); REGISTER_SCATTER_LIKE_KERNEL("dim_scatter_update_like", Update); -REGISTER_SCATTER_KERNEL("dim_scatter_add", Add); +// REGISTER_SCATTER_KERNEL("dim_scatter_add", Add); REGISTER_SCATTER_KERNEL("dim_scatter_update", Update); } // namespace user_op diff --git a/oneflow/user/ops/dim_gather_op.cpp b/oneflow/user/ops/dim_gather_op.cpp index 7e2d59b17d4..a8345787c70 100644 --- a/oneflow/user/ops/dim_gather_op.cpp +++ b/oneflow/user/ops/dim_gather_op.cpp @@ -95,10 +95,10 @@ 
REGISTER_USER_OP_GRAD("dim_gather").SetBackwardOpConfGenFn([](user_op::BackwardO ctx->DefineOp(op_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { return builder .OpTypeName( - "dim_scatter_add_like")  // dim_scatter_add_like(like, dim, index, input) -> output + "dim_scatter_add_like")  // dim_scatter_add_like(like, dim, index, src) -> output .InputBind("index", ctx->FwOp().input("index", 0))  // scatter.index <- gather.index .InputBind("src", - ctx->FwOp().output_grad("output", 0))  // scatter.input <- grad of gather.out + ctx->FwOp().output_grad("output", 0))  // scatter.src <- grad of gather.out .InputBind("like", ctx->FwOp().input("input", 0)) .Output("output") .Attr("dim", ctx->FwOp().attr("dim")) diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 9ad935f1f67..36cabc906ad 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -13,7 +13,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include  #include "oneflow/core/common/error.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/framework/user_op_registry.h" From cf40ffc840970e263b6bd90113ac39465bba5bea Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sat, 17 Jul 2021 15:40:33 +0800 Subject: [PATCH 54/82] extract scatter update ops --- oneflow/user/kernels/dim_scatter_kernels.cpp | 148 +++++++------------ 1 file changed, 50 insertions(+), 98 deletions(-) diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 7b5edc22aca..34360c0eca2 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -124,98 +124,11 @@ REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, double, int64_t); REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, int32_t, int64_t); #endif  // WITH_CUDA -#define IMPLEMENT_DIMSCATTER_KERNEL_CLASS(binop) \ - template \ - class DimScatter##binop##Kernel final : public DimScatterBaseKernel { \ - public: \ - DimScatter##binop##Kernel() = default; \ - ~DimScatter##binop##Kernel() override = default; \ - \ - private: \ - void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, \ - const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, int ndim, int64_t elem_cnt, \ - int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T* src, \ - IN_T* output) const override { \ - DimScatter##binop##Functor()( \ - ctx, src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, \ - index, src, output); \ - } \ - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } \ - } - -#define REGISTER_DIM_SCATTER_LIKE_KERNEL(device, dtype, itype, optypename, binop) \ - REGISTER_USER_KERNEL(optypename) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("like", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); - -#define REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(device, optypename, binop) \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(device, float, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(device, double, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(device, int32_t, int32_t, optypename, binop) \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(device, float, int64_t, optypename, binop) \ -
REGISTER_DIM_SCATTER_LIKE_KERNEL(device, double, int64_t, optypename, binop) \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(device, int32_t, int64_t, optypename, binop) - -#define REGISTER_DIM_SCATTER_LIKE_CPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); - -#ifdef WITH_CUDA -#define REGISTER_DIM_SCATTER_LIKE_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_LIKE_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_LIKE_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); -#else -#define REGISTER_DIM_SCATTER_LIKE_GPUKERNELS(optypename, binop) -#endif // WITH_CUDA - -#define REGISTER_SCATTER_LIKE_KERNEL(optypename, binop) \ - REGISTER_DIM_SCATTER_LIKE_CPUKERNELS(optypename, binop); \ - REGISTER_DIM_SCATTER_LIKE_GPUKERNELS(optypename, binop); - -#define REGISTER_DIM_SCATTER_KERNEL(device, dtype, itype, optypename, binop) \ - REGISTER_USER_KERNEL(optypename) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)) - -#define REGISTER_DIM_SCATTER_BINOP_KERNELS_DEVICE(device, optypename, binop) \ - REGISTER_DIM_SCATTER_KERNEL(device, float, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(device, double, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(device, float, int64_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(device, double, int64_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(device, int32_t, int64_t, optypename, binop); - -#define REGISTER_DIM_SCATTER_CPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_KERNELS_DEVICE(DeviceType::kCPU, optypename, binop); - -#ifdef WITH_CUDA -#define REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop) \ - REGISTER_DIM_SCATTER_BINOP_KERNELS_DEVICE(DeviceType::kGPU, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int32_t, optypename, binop); \ - REGISTER_DIM_SCATTER_KERNEL(DeviceType::kGPU, float16, int64_t, optypename, binop); -#else -#define REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop) -#endif // WITH_CUDA - -#define REGISTER_SCATTER_KERNEL(optypename, binop) \ - REGISTER_DIM_SCATTER_CPUKERNELS(optypename, binop); \ - REGISTER_DIM_SCATTER_GPUKERNELS(optypename, binop); - template -class DimScatterBaseKernel : public user_op::OpKernel { +class DimScatterUpdateKernel final : public user_op::OpKernel { public: - DimScatterBaseKernel() = default; - ~DimScatterBaseKernel() override = default; - virtual void BinaryOp(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, - const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim, int64_t upper_bound, const IDX_T* index, - const IN_T* src, IN_T* output) const = 0; + DimScatterUpdateKernel() = default; + ~DimScatterUpdateKernel() override = default; private: void Compute(KernelComputeContext* ctx) const override { @@ -262,19 +175,58 @@ class DimScatterBaseKernel : public user_op::OpKernel { upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound } - BinaryOp(ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, - index_tensor->shape().elem_cnt(), dim, upper_bound, index, src, output); + 
DimScatterUpdateFunctor()( + ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, + index_tensor->shape().elem_cnt(), dim, upper_bound, index, src, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -// IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Add); -IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Update); +#define REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(device, dtype, itype) \ + REGISTER_USER_KERNEL("dim_scatter_update_like") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("like", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); + +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, float, int32_t); +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, double, int32_t); +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, int32_t, int32_t); +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, float, int64_t); +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, double, int64_t); +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, int32_t, int64_t); + +#ifdef WITH_CUDA +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, float, int32_t); +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, double, int32_t); +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, int32_t, int32_t); +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, float, int64_t); +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, double, int64_t); +REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, int32_t, int64_t); +#endif // WITH_CUDA -// REGISTER_SCATTER_LIKE_KERNEL("dim_scatter_add_like", Add); -REGISTER_SCATTER_LIKE_KERNEL("dim_scatter_update_like", Update); -// REGISTER_SCATTER_KERNEL("dim_scatter_add", Add); -REGISTER_SCATTER_KERNEL("dim_scatter_update", Update); +#define REGISTER_DIM_SCATTER_UPDATE_KERNEL(device, dtype, itype) \ + REGISTER_USER_KERNEL("dim_scatter_update") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ + & (user_op::HobDataType("input", 0) == GetDataType::value) \ + & (user_op::HobDataType("index", 0) == GetDataType::value)); + +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, float, int32_t); +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, double, int32_t); +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, int32_t, int32_t); +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, float, int64_t); +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, double, int64_t); +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, int32_t, int64_t); + +#ifdef WITH_CUDA +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, float, int32_t); +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, double, int32_t); +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, int32_t, int32_t); +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, float, int64_t); +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, double, int64_t); +REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, int32_t, int64_t); +#endif // WITH_CUDA } // namespace user_op } // namespace oneflow From d4cda6d8da862a8f546783766b88029e84a4ec97 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sat, 17 Jul 2021 15:54:38 +0800 Subject: [PATCH 55/82] add add/update functor --- oneflow/user/kernels/dim_gather_scatter_util.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index 
b689d5343ed..aa321a472cf 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -50,6 +50,22 @@ struct DeviceBinOp { OF_DEVICE_FUNC static void Update(const T* x, T* y) { *y = *x; } }; +template +struct BinOpAddFunctor { + OF_DEVICE_FUNC static void apply(const T* x, T* y) { +#ifdef __CUDA_ARCH__ + cuda::atomic::Add(y, *x); +#else + *y += *x; +#endif + } +}; + +template +struct BinOpUpdateFunctor { + OF_DEVICE_FUNC static void Update(const T* x, T* y) { *y = *x; } +}; + // ----- macros for scatter functors ----- #define DECLARE_DIMSCATTER_FUNCTOR(binop) \ template \ From e4a56ad26ef8f3ac082e16ad7e59e75af4c4592b Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 09:04:57 +0800 Subject: [PATCH 56/82] rewriting by functors --- .../user/kernels/dim_gather_scatter_util.h | 11 +- .../user/kernels/dim_scatter_kernel_util.cpp | 35 ++++- .../user/kernels/dim_scatter_kernel_util.cu | 57 +++++++- .../user/kernels/dim_scatter_kernel_util.h | 53 +++++++- oneflow/user/kernels/dim_scatter_kernels.cpp | 126 +++++++++--------- 5 files changed, 208 insertions(+), 74 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index aa321a472cf..259735fadc9 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -63,7 +63,16 @@ struct BinOpUpdateFunctor { - OF_DEVICE_FUNC static void Update(const T* x, T* y) { *y = *x; } + OF_DEVICE_FUNC static void apply(const T* x, T* y) { *y = *x; } +}; + +template class Opt> +struct DimScatterFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const IN_T* src, IN_T* output); }; // ----- macros for scatter functors ----- #define DECLARE_DIMSCATTER_FUNCTOR(binop) \ template \ diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.cpp index 04027840a68..ffe45bc275b 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cpp @@ -21,10 +21,37 @@ limitations under the License.
namespace oneflow { namespace user_op { -IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Add); -IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Update); +template class Opt> +struct DimScatterFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const IN_T* src, IN_T* output) { + DoDimScatter(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, + dim, upper_bound, index, src, output); + } +}; -INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Add); -INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Update); +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; + +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; + +// IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Update); +// INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Update); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cu b/oneflow/user/kernels/dim_scatter_kernel_util.cu index bb58e41dc60..45a8f135dcb 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cu @@ -20,11 +20,60 @@ limitations under the License. namespace oneflow { namespace user_op { -IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Add); -IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Update); +template class Opt> +__global__ void DoCUDADimScatter(const DimOpIndexNdHelper src_nd_helper, + const DimOpIndexNdHelper idx_nd_helper, + const DimOpIndexNdHelper output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, + const int64_t upper_bound, const IDX_T* index, const IN_T* src, + IN_T* output) { + DoDimScatter(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, + dim, upper_bound, index, src, output); +} -INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Add); -INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Update); +template class Opt> +struct DimScatterFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const IN_T* src, IN_T* output) { + RUN_CUDA_KERNEL((DoCUDADimScatter), ctx, BlocksNum4ThreadsNum(elem_cnt), + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, src, output); + } +}; + +template class Opt> +struct DimScatterFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const float16* src, float16* output) { + RUN_CUDA_KERNEL((DoCUDADimScatter), ctx, BlocksNum4ThreadsNum(elem_cnt), + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, reinterpret_cast(src), + reinterpret_cast(output)); + } +}; + +template 
struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; + +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; +template struct DimScatterFunctor; + +// IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Update); +// INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Update); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index 7f9010af6ab..db71de0e56e 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -44,8 +44,33 @@ namespace oneflow { namespace user_op { -DECLARE_DIMSCATTER_FUNCTOR(Add); -DECLARE_DIMSCATTER_FUNCTOR(Update); +// DECLARE_DIMSCATTER_FUNCTOR(Update); + +template class Opt> +OF_DEVICE_FUNC void DoDimScatter(const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, + const IDX_T* index, const IN_T* src, IN_T* output) { + XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { + IDX_T coordinate[kDimGatherMaxDimCount] = {0}; + idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk + IDX_T idx_elem = index[idx_offset]; + if (idx_elem >= upper_bound) { +#if __CUDA_ARCH__ + __trap(); +#else + std::cout << "The index element " << idx_elem << " is out of bounds for dimension " << dim + << " with size " << upper_bound << std::endl; + throw Error::CheckFailedError(); +#endif + } + IDX_T src_offset = src_nd_helper.NdIndexToOffset(coordinate, ndim); + coordinate[dim] = idx_elem; + IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); + Opt::apply(src + src_offset, output + output_offset); + } +} template OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_helper, @@ -74,6 +99,30 @@ OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_he } } +#define INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(devicetype, dtype, itype, opt) \ + template struct DimScatterFunctor>; + +#define INSTANTIATE_DIM_SCATTER_GPUFUNCTORS_TEST(opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, int32_t, int32_t, opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, float, int32_t, opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, double, int32_t, opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, float16, int32_t, opt) \ + \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, int32_t, int64_t, opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, float, int64_t, opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, double, int64_t, opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, float16, int64_t, opt) + +#define INSTANTIATE_DIM_SCATTER_CPUFUNCTORS_TEST(opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, int32_t, int32_t, opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, float, int32_t, opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, double, int32_t, opt) \ + \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, int32_t, int64_t, opt) \ + INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, float, int64_t, opt) \ + 
INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, double, int64_t, opt) + +//------------upper:new, below: old------------ #define INSTANTIATE_DIM_SCATTER_FUNCTOR(devicetype, dtype, itype, binop) \ template struct DimScatter##binop##Functor; diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 34360c0eca2..8394dded3bf 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -20,7 +20,7 @@ limitations under the License. namespace oneflow { namespace user_op { -template +template class Opt> class DimScatterAddKernel final : public user_op::OpKernel { public: DimScatterAddKernel() = default; @@ -71,7 +71,7 @@ class DimScatterAddKernel final : public user_op::OpKernel { upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound } - DimScatterAddFunctor()( + DimScatterFunctor()( ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, index_tensor->shape().elem_cnt(), dim, upper_bound, index, src, output); } @@ -80,7 +80,7 @@ class DimScatterAddKernel final : public user_op::OpKernel { #define REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_add_like") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("like", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); @@ -103,7 +103,7 @@ REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, int32_t, int64_t); #define REGISTER_DIM_SCATTER_ADD_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_add") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); @@ -124,67 +124,67 @@ REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, double, int64_t); REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, int32_t, int64_t); #endif // WITH_CUDA -template -class DimScatterUpdateKernel final : public user_op::OpKernel { - public: - DimScatterUpdateKernel() = default; - ~DimScatterUpdateKernel() override = default; - - private: - void Compute(KernelComputeContext* ctx) const override { - const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); - const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); - Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); - const Tensor* src_tensor = ctx->Tensor4ArgNameAndIndex("src", 0); - const int32_t dim = ctx->Attr("dim"); - - const IDX_T* index = index_tensor->dptr(); - IN_T* output = out_tensor->mut_dptr(); - size_t out_bytes_size = - out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); - - Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); - const IN_T* src = src_tensor->dptr(); - - if (input_tensor) { - Memcpy(ctx->device_ctx(), output, input_tensor->dptr(), out_bytes_size); - } else if (like_tensor) { - Memset(ctx->device_ctx(), output, 0, out_bytes_size); - } else { - std::cout << "Unimplemented Error" << std::endl; - throw Error::Unimplemented(); - } - - const int ndim = src_tensor->shape().NumAxes(); - fixed_vector shape_vec(ndim); - auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { - std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), - [](int32_t dim) -> IDX_T { return static_cast(dim); }); - }; - 
shape2dims(src_tensor->shape()); - DimOpIndexNdHelper src_nd_helper(shape_vec.data(), ndim); - shape2dims(index_tensor->shape()); - DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); - shape2dims(out_tensor->shape()); - DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); - - int64_t upper_bound = 0; - if (input_tensor) { - upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound - } else { - upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound - } - - DimScatterUpdateFunctor()( - ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, - index_tensor->shape().elem_cnt(), dim, upper_bound, index, src, output); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; +// template +// class DimScatterUpdateKernel final : public user_op::OpKernel { +// public: +// DimScatterUpdateKernel() = default; +// ~DimScatterUpdateKernel() override = default; + +// private: +// void Compute(KernelComputeContext* ctx) const override { +// const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); +// const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); +// Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); +// const Tensor* src_tensor = ctx->Tensor4ArgNameAndIndex("src", 0); +// const int32_t dim = ctx->Attr("dim"); + +// const IDX_T* index = index_tensor->dptr(); +// IN_T* output = out_tensor->mut_dptr(); +// size_t out_bytes_size = +// out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); + +// Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); +// const IN_T* src = src_tensor->dptr(); + +// if (input_tensor) { +// Memcpy(ctx->device_ctx(), output, input_tensor->dptr(), out_bytes_size); +// } else if (like_tensor) { +// Memset(ctx->device_ctx(), output, 0, out_bytes_size); +// } else { +// std::cout << "Unimplemented Error" << std::endl; +// throw Error::Unimplemented(); +// } + +// const int ndim = src_tensor->shape().NumAxes(); +// fixed_vector shape_vec(ndim); +// auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { +// std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), +// [](int32_t dim) -> IDX_T { return static_cast(dim); }); +// }; +// shape2dims(src_tensor->shape()); +// DimOpIndexNdHelper src_nd_helper(shape_vec.data(), ndim); +// shape2dims(index_tensor->shape()); +// DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); +// shape2dims(out_tensor->shape()); +// DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); + +// int64_t upper_bound = 0; +// if (input_tensor) { +// upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound +// } else { +// upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound +// } + +// DimScatterUpdateFunctor()( +// ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, +// index_tensor->shape().elem_cnt(), dim, upper_bound, index, src, output); +// } +// bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +// }; #define REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_update_like") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("like", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); @@ -207,7 +207,7 @@ 
REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, int32_t, int64_t); #define REGISTER_DIM_SCATTER_UPDATE_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_update") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); From 1e2d724319e9e4153fe03764e6de7c8eb737f63c Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 09:25:38 +0800 Subject: [PATCH 57/82] refine --- .../user/kernels/dim_gather_scatter_util.h | 106 -------------- .../user/kernels/dim_scatter_kernel_util.cpp | 16 +-- .../user/kernels/dim_scatter_kernel_util.cu | 18 +-- .../user/kernels/dim_scatter_kernel_util.h | 134 +++++------------- oneflow/user/kernels/dim_scatter_kernels.cpp | 72 +--------- 5 files changed, 48 insertions(+), 298 deletions(-) diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h index 259735fadc9..7ce8a209cce 100644 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ b/oneflow/user/kernels/dim_gather_scatter_util.h @@ -28,115 +28,9 @@ limitations under the License. namespace oneflow { -constexpr int kDimGatherMaxDimCount = 8; - namespace user_op { -template -using DimOpIndexNdHelper = NdIndexOffsetHelper; - -template -using BinaryOpFn = void (*)(const T* x, T* y); - -template -struct DeviceBinOp { - OF_DEVICE_FUNC static void Add(const T* x, T* y) { -#ifdef __CUDA_ARCH__ - cuda::atomic::Add(y, *x); -#else - *y += *x; -#endif - } - OF_DEVICE_FUNC static void Update(const T* x, T* y) { *y = *x; } -}; - -template -struct BinOpAddFunctor { - OF_DEVICE_FUNC static void apply(const T* x, T* y) { -#ifdef __CUDA_ARCH__ - cuda::atomic::Add(y, *x); -#else - *y += *x; -#endif - } -}; - -template -struct BinOpUpdateFunctor { - OF_DEVICE_FUNC static void apply(const T* x, T* y) { *y = *x; } -}; - -template class Opt> -struct DimScatterFunctor final { - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, - const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, - const IDX_T* index, const IN_T* src, IN_T* output); -}; - -// ----- macros for scatter functors ----- -#define DECLARE_DIMSCATTER_FUNCTOR(binop) \ - template \ - struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, \ - const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, const int ndim, \ - const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ - const IDX_T* index, const IN_T* src, IN_T* output); \ - } - -#define IMPLEMENT_DIMSCATTER_CPUFUNCTOR(binop) \ - template \ - struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, \ - const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, const int ndim, \ - const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ - const IDX_T* index, const IN_T* src, IN_T* output) { \ - DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, \ - elem_cnt, dim, upper_bound, index, src, output, \ - DeviceBinOp::binop); \ - } \ - } -#define IMPLEMENT_DIMSCATTER_GPUFUNCTOR(binop) \ - template \ - __global__ void DoCUDADimScatter##binop(const DimOpIndexNdHelper src_nd_helper, \ - const DimOpIndexNdHelper 
idx_nd_helper, \ - const DimOpIndexNdHelper output_nd_helper, \ - const int ndim, const int64_t elem_cnt, \ - const int32_t dim, const int64_t upper_bound, \ - const IDX_T* index, const IN_T* src, IN_T* output) { \ - DoDimScatterBinOp(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, \ - dim, upper_bound, index, src, output, \ - DeviceBinOp::binop); \ - } \ - template \ - struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, \ - const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, const int ndim, \ - const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ - const IDX_T* index, const IN_T* src, IN_T* output) { \ - RUN_CUDA_KERNEL((DoCUDADimScatter##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ - src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, \ - upper_bound, index, src, output); \ - } \ - }; \ - template \ - struct DimScatter##binop##Functor final { \ - void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, \ - const DimOpIndexNdHelper& idx_nd_helper, \ - const DimOpIndexNdHelper& output_nd_helper, const int ndim, \ - const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, \ - const IDX_T* index, const float16* src, float16* output) { \ - RUN_CUDA_KERNEL((DoCUDADimScatter##binop), ctx, BlocksNum4ThreadsNum(elem_cnt), \ - src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, \ - upper_bound, index, reinterpret_cast(src), \ - reinterpret_cast(output)); \ - } \ - } } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.cpp index ffe45bc275b..13adeeb617d 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cpp @@ -33,25 +33,13 @@ struct DimScatterFunctor final { } }; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; +INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCPU, BinOpAddFunctor); template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; +INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCPU, BinOpUpdateFunctor); template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; template struct DimScatterFunctor; -// IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Update); -// INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Update); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cu b/oneflow/user/kernels/dim_scatter_kernel_util.cu index 45a8f135dcb..26f001f05e5 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cu @@ -58,22 +58,8 @@ struct DimScatterFunctor final { } }; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; - -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; -template struct DimScatterFunctor; - 
-// IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Update); -// INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Update); +INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kGPU, BinOpAddFunctor); +INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kGPU, BinOpUpdateFunctor); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index db71de0e56e..e64eb479b36 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -18,33 +18,47 @@ limitations under the License. #include "oneflow/user/kernels/dim_gather_scatter_util.h" #include "oneflow/core/common/error.pb.h" -// Steps for adding a binary operation on scatter are as follows: -// 1. implment binop in DeviceBinOp, for example "Mul": -// OF_DEVICE_FUNC static void Mul(const T* x, T* y) { *y *= *x; } -// -// 2. Implement kernels in dim_scatter_kernels.cpp: -// IMPLEMENT_DIMSCATTER_KERNEL_CLASS(Mul); -// -// 3. Register kernels -// REGISTER_SCATTER_OUTPLACE_KERNEL("dim_scatter_mul_like", Mul); -// -// 4. Declare Functor in dim_scatter_kernel_util.h: -// DECLARE_DIMSCATTER_FUNCTOR(Mul); -// -// 5. Implement functors in dim_scatter_kernel_util.cu and cpp file: -// in .cu file: -// IMPLEMENT_DIMSCATTER_GPUFUNCTOR(Mul); -// INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(Mul); -// in .cpp file: -// IMPLEMENT_DIMSCATTER_CPUFUNCTOR(Mul); -// INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(Mul); -// - namespace oneflow { namespace user_op { -// DECLARE_DIMSCATTER_FUNCTOR(Update); +constexpr int kDimGatherMaxDimCount = 8; + +template +using DimOpIndexNdHelper = NdIndexOffsetHelper; + +#define INSTANTIATE_DIM_SCATTER_FUNCTORS(device_type, opt) \ + template struct DimScatterFunctor; \ + template struct DimScatterFunctor; \ + template struct DimScatterFunctor; \ + template struct DimScatterFunctor; \ + template struct DimScatterFunctor; \ + template struct DimScatterFunctor; + +template +struct BinOpAddFunctor { + OF_DEVICE_FUNC static void apply(const T* x, T* y) { +#ifdef __CUDA_ARCH__ + cuda::atomic::Add(y, *x); +#else + *y += *x; +#endif + } +}; + +template +struct BinOpUpdateFunctor { + OF_DEVICE_FUNC static void apply(const T* x, T* y) { *y = *x; } +}; + +template class Opt> +struct DimScatterFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const IN_T* src, IN_T* output); +}; template class Opt> OF_DEVICE_FUNC void DoDimScatter(const DimOpIndexNdHelper& src_nd_helper, @@ -72,80 +86,6 @@ OF_DEVICE_FUNC void DoDimScatter(const DimOpIndexNdHelper& src_nd_helper, } } -template -OF_DEVICE_FUNC void DoDimScatterBinOp(const DimOpIndexNdHelper& src_nd_helper, - const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, - const int ndim, const int64_t elem_cnt, const int32_t dim, - int64_t upper_bound, const IDX_T* index, const IN_T* src, - IN_T* output, BinaryOpFn bin_op) { - XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { - IDX_T coordinate[kDimGatherMaxDimCount] = {0}; - idx_nd_helper.OffsetToNdIndex(idx_offset, coordinate, ndim); // idx_offset -> ijk - IDX_T idx_elem = index[idx_offset]; - if (idx_elem >= upper_bound) { -#if __CUDA_ARCH__ - __trap(); -#else - std::cout << "The index element " << idx_elem << " is out of bounds for dimension " << dim - << " with size " << upper_bound << std::endl; - 
throw Error::CheckFailedError(); -#endif - } - IDX_T src_offset = src_nd_helper.NdIndexToOffset(coordinate, ndim); - coordinate[dim] = idx_elem; - IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); - bin_op(src + src_offset, output + output_offset); - } -} - -#define INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(devicetype, dtype, itype, opt) \ - template struct DimScatterFunctor>; - -#define INSTANTIATE_DIM_SCATTER_GPUFUNCTORS_TEST(opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, int32_t, int32_t, opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, float, int32_t, opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, double, int32_t, opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, float16, int32_t, opt) \ - \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, int32_t, int64_t, opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, float, int64_t, opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, double, int64_t, opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kGPU, float16, int64_t, opt) - -#define INSTANTIATE_DIM_SCATTER_CPUFUNCTORS_TEST(opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, int32_t, int32_t, opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, float, int32_t, opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, double, int32_t, opt) \ - \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, int32_t, int64_t, opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, float, int64_t, opt) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR_TEST(DeviceType::kCPU, double, int64_t, opt) - -//------------upper:new, below: old------------ -#define INSTANTIATE_DIM_SCATTER_FUNCTOR(devicetype, dtype, itype, binop) \ - template struct DimScatter##binop##Functor; - -#define INSTANTIATE_DIM_SCATTER_GPUFUNCTORS(binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, int32_t, int32_t, binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, float, int32_t, binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, double, int32_t, binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, float16, int32_t, binop) \ - \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, int32_t, int64_t, binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, float, int64_t, binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, double, int64_t, binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kGPU, float16, int64_t, binop) - -#define INSTANTIATE_DIM_SCATTER_CPUFUNCTORS(binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, int32_t, int32_t, binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, float, int32_t, binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, double, int32_t, binop) \ - \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, int32_t, int64_t, binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, float, int64_t, binop) \ - INSTANTIATE_DIM_SCATTER_FUNCTOR(DeviceType::kCPU, double, int64_t, binop) - } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 8394dded3bf..b9d4196f433 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -21,10 +21,10 @@ namespace oneflow { namespace user_op { template class Opt> -class DimScatterAddKernel final : public user_op::OpKernel { +class DimScatterKernel final : public user_op::OpKernel { public: - 
DimScatterAddKernel() = default; - ~DimScatterAddKernel() override = default; + DimScatterKernel() = default; + ~DimScatterKernel() override = default; private: void Compute(KernelComputeContext* ctx) const override { @@ -80,7 +80,7 @@ class DimScatterAddKernel final : public user_op::OpKernel { #define REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_add_like") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("like", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); @@ -103,7 +103,7 @@ REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, int32_t, int64_t); #define REGISTER_DIM_SCATTER_ADD_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_add") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); @@ -124,67 +124,9 @@ REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, double, int64_t); REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, int32_t, int64_t); #endif // WITH_CUDA -// template -// class DimScatterUpdateKernel final : public user_op::OpKernel { -// public: -// DimScatterUpdateKernel() = default; -// ~DimScatterUpdateKernel() override = default; - -// private: -// void Compute(KernelComputeContext* ctx) const override { -// const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); -// const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); -// Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); -// const Tensor* src_tensor = ctx->Tensor4ArgNameAndIndex("src", 0); -// const int32_t dim = ctx->Attr("dim"); - -// const IDX_T* index = index_tensor->dptr(); -// IN_T* output = out_tensor->mut_dptr(); -// size_t out_bytes_size = -// out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); - -// Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); -// const IN_T* src = src_tensor->dptr(); - -// if (input_tensor) { -// Memcpy(ctx->device_ctx(), output, input_tensor->dptr(), out_bytes_size); -// } else if (like_tensor) { -// Memset(ctx->device_ctx(), output, 0, out_bytes_size); -// } else { -// std::cout << "Unimplemented Error" << std::endl; -// throw Error::Unimplemented(); -// } - -// const int ndim = src_tensor->shape().NumAxes(); -// fixed_vector shape_vec(ndim); -// auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { -// std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), -// [](int32_t dim) -> IDX_T { return static_cast(dim); }); -// }; -// shape2dims(src_tensor->shape()); -// DimOpIndexNdHelper src_nd_helper(shape_vec.data(), ndim); -// shape2dims(index_tensor->shape()); -// DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); -// shape2dims(out_tensor->shape()); -// DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); - -// int64_t upper_bound = 0; -// if (input_tensor) { -// upper_bound = input_tensor->shape().At(dim); // ensure the idx is smaller than upperbound -// } else { -// upper_bound = like_tensor->shape().At(dim); // ensure the idx is smaller than upperbound -// } - -// DimScatterUpdateFunctor()( -// ctx->device_ctx(), src_nd_helper, idx_nd_helper, output_nd_helper, ndim, -// index_tensor->shape().elem_cnt(), dim, upper_bound, index, src, output); -// } -// bool AlwaysComputeWhenAllOutputsEmpty() const 
override { return false; } -// }; - #define REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_update_like") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("like", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); @@ -207,7 +149,7 @@ REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, int32_t, int64_t); #define REGISTER_DIM_SCATTER_UPDATE_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_update") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); From e569e4003efda620cec77dc088a2e9a1c351f061 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 09:49:38 +0800 Subject: [PATCH 58/82] remove dim_gather_scatter_uitl.h --- .../user/kernels/dim_gather_scatter_util.h | 38 ------------------- .../user/kernels/dim_scatter_kernel_util.cpp | 1 - .../user/kernels/dim_scatter_kernel_util.h | 10 ++++- oneflow/user/ops/dim_scatter_ops.cpp | 2 +- 4 files changed, 10 insertions(+), 41 deletions(-) delete mode 100644 oneflow/user/kernels/dim_gather_scatter_util.h diff --git a/oneflow/user/kernels/dim_gather_scatter_util.h b/oneflow/user/kernels/dim_gather_scatter_util.h deleted file mode 100644 index 7ce8a209cce..00000000000 --- a/oneflow/user/kernels/dim_gather_scatter_util.h +++ /dev/null @@ -1,38 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_USER_KERNELS_DIM_GAHTER_SCATTER__UTIL_H_ -#define ONEFLOW_USER_KERNELS_DIM_GAHTER_SCATTER__UTIL_H_ - -#ifdef WITH_CUDA -#include "oneflow/core/cuda/atomic.cuh" -#endif // WITH_CUDA - -#include "oneflow/core/ndarray/xpu_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/common/shape_view.h" - -namespace oneflow { - -namespace user_op { - - - -} // namespace user_op -} // namespace oneflow - -#endif diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.cpp index 13adeeb617d..ae8cbfd65ce 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cpp @@ -15,7 +15,6 @@ limitations under the License. */ #include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/dim_gather_scatter_util.h" #include "oneflow/user/kernels/dim_scatter_kernel_util.h" namespace oneflow { diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index e64eb479b36..89ac22ea4f3 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -15,7 +15,15 @@ limitations under the License. 
*/ #ifndef ONEFLOW_USER_KERNELS_DIM_SCATTER_KERNEL_UTIL_H_ #define ONEFLOW_USER_KERNELS_DIM_SCATTER_KERNEL_UTIL_H_ -#include "oneflow/user/kernels/dim_gather_scatter_util.h" +#ifdef WITH_CUDA +#include "oneflow/core/cuda/atomic.cuh" +#endif // WITH_CUDA + +#include "oneflow/core/ndarray/xpu_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/common/shape_view.h" #include "oneflow/core/common/error.pb.h" namespace oneflow { diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 36cabc906ad..bcc1842bc69 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -16,7 +16,7 @@ limitations under the License. #include "oneflow/core/common/error.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/framework/user_op_registry.h" -#include "oneflow/user/kernels/dim_gather_scatter_util.h" +#include "oneflow/user/kernels/dim_scatter_kernel_util.h" namespace oneflow { From 2bcf541d7a7585fbbb24a58752471d4781c8886e Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 09:51:45 +0800 Subject: [PATCH 59/82] add blank line --- oneflow/core/functional/functional_api.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 578bcf0d82a..58fbf2cbb68 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -617,4 +617,3 @@ - name: "dim_scatter_add" signature: "Tensor DimScatterAdd(Tensor input, Tensor index, Tensor src, *, Int32 dim)" bind_python: True - \ No newline at end of file From 919eb6bdc994c7c0f935a578259c89277ca07b84 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 10:14:36 +0800 Subject: [PATCH 60/82] refine macros for registering kerenls --- .../user/kernels/dim_scatter_kernel_util.h | 2 +- oneflow/user/kernels/dim_scatter_kernels.cpp | 126 +++++++----------- 2 files changed, 47 insertions(+), 81 deletions(-) diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index 89ac22ea4f3..34ff5cd31d3 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -97,4 +97,4 @@ OF_DEVICE_FUNC void DoDimScatter(const DimOpIndexNdHelper& src_nd_helper, } // namespace user_op } // namespace oneflow -#endif // ONEFLOW_USER_KERNELS_DIM_GATHER_KERNEL_UTIL_H_ +#endif // ONEFLOW_USER_KERNELS_DIM_SCATTER_KERNEL_UTIL_H_ diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index b9d4196f433..4238a5622a0 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -78,96 +78,62 @@ class DimScatterKernel final : public user_op::OpKernel { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(device, dtype, itype) \ - REGISTER_USER_KERNEL("dim_scatter_add_like") \ - .SetCreateFn>() \ +#define REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, device, dtype, itype, opt) \ + REGISTER_USER_KERNEL(op_type) \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("like", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, float, int32_t); 
-REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, double, int32_t); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, int32_t, int32_t); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, float, int64_t); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, double, int64_t); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kCPU, int32_t, int64_t); - -#ifdef WITH_CUDA -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, float, int32_t); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, double, int32_t); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, int32_t, int32_t); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, float, int64_t); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, double, int64_t); -REGISTER_DIM_SCATTER_ADD_LIKE_KERNEL(DeviceType::kGPU, int32_t, int64_t); -#endif // WITH_CUDA - -#define REGISTER_DIM_SCATTER_ADD_KERNEL(device, dtype, itype) \ - REGISTER_USER_KERNEL("dim_scatter_add") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); - -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, float, int32_t); -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, double, int32_t); -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, int32_t, int32_t); -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, float, int64_t); -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, double, int64_t); -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kCPU, int32_t, int64_t); - -#ifdef WITH_CUDA -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, float, int32_t); -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, double, int32_t); -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, int32_t, int32_t); -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, float, int64_t); -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, double, int64_t); -REGISTER_DIM_SCATTER_ADD_KERNEL(DeviceType::kGPU, int32_t, int64_t); -#endif // WITH_CUDA - -#define REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(device, dtype, itype) \ - REGISTER_USER_KERNEL("dim_scatter_update_like") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("like", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); - -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, float, int32_t); -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, double, int32_t); -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, int32_t, int32_t); -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, float, int64_t); -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, double, int64_t); -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kCPU, int32_t, int64_t); - -#ifdef WITH_CUDA -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, float, int32_t); -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, double, int32_t); -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, int32_t, int32_t); -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, float, int64_t); -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, double, int64_t); -REGISTER_DIM_SCATTER_UPDATE_LIKE_KERNEL(DeviceType::kGPU, int32_t, int64_t); -#endif // WITH_CUDA - -#define REGISTER_DIM_SCATTER_UPDATE_KERNEL(device, dtype, itype) \ - REGISTER_USER_KERNEL("dim_scatter_update") \ - .SetCreateFn>() \ +#define REGISTER_DIM_SCATTER_LIKE_CPU_KERNELS(op_type, opt) \ + 
REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kCPU, float, int32_t, opt); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kCPU, double, int32_t, opt); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kCPU, int32_t, int32_t, opt); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kCPU, float, int64_t, opt); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kCPU, double, int64_t, opt); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kCPU, int32_t, int64_t, opt); + +#define REGISTER_DIM_SCATTER_LIKE_GPU_KERNELS(op_type, opt) \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kGPU, float, int32_t, opt); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kGPU, double, int32_t, opt); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kGPU, int32_t, int32_t, opt); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kGPU, float, int64_t, opt); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kGPU, double, int64_t, opt); \ + REGISTER_DIM_SCATTER_LIKE_KERNEL(op_type, DeviceType::kGPU, int32_t, int64_t, opt); + +#define REGISTER_DIM_SCATTER_KERNEL(op_type, device, dtype, itype, opt) \ + REGISTER_USER_KERNEL(op_type) \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, float, int32_t); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, double, int32_t); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, int32_t, int32_t); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, float, int64_t); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, double, int64_t); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kCPU, int32_t, int64_t); +#define REGISTER_DIM_SCATTER_CPU_KERNELS(op_type, opt) \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kCPU, float, int32_t, opt); \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kCPU, double, int32_t, opt); \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kCPU, int32_t, int32_t, opt); \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kCPU, float, int64_t, opt); \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kCPU, double, int64_t, opt); \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kCPU, int32_t, int64_t, opt); + +#define REGISTER_DIM_SCATTER_GPU_KERNELS(op_type, opt) \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kGPU, float, int32_t, opt); \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kGPU, double, int32_t, opt); \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kGPU, int32_t, int32_t, opt); \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kGPU, float, int64_t, opt); \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kGPU, double, int64_t, opt); \ + REGISTER_DIM_SCATTER_KERNEL(op_type, DeviceType::kGPU, int32_t, int64_t, opt); + +REGISTER_DIM_SCATTER_LIKE_CPU_KERNELS("dim_scatter_add_like", BinOpAddFunctor); +REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_add", BinOpAddFunctor); +REGISTER_DIM_SCATTER_LIKE_CPU_KERNELS("dim_scatter_update_like", BinOpUpdateFunctor); +REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_update", BinOpUpdateFunctor); #ifdef WITH_CUDA -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, float, int32_t); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, double, int32_t); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, int32_t, int32_t); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, float, 
int64_t); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, double, int64_t); -REGISTER_DIM_SCATTER_UPDATE_KERNEL(DeviceType::kGPU, int32_t, int64_t); +REGISTER_DIM_SCATTER_LIKE_GPU_KERNELS("dim_scatter_add_like", BinOpAddFunctor); +REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_add", BinOpAddFunctor); +REGISTER_DIM_SCATTER_LIKE_GPU_KERNELS("dim_scatter_update_like", BinOpUpdateFunctor); +REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_update", BinOpUpdateFunctor); #endif // WITH_CUDA } // namespace user_op From 3a39636682b1dfd234f0279e658a5a1251b30912 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 11:29:56 +0800 Subject: [PATCH 61/82] refine dim_scatter_scalar files name --- oneflow/user/kernels/dim_scatter_scalar.cu | 104 ------------------ .../dim_scatter_scalar_kernel_util.cpp | 38 +++++++ .../kernels/dim_scatter_scalar_kernel_util.cu | 52 +++++++++ ...lar.h => dim_scatter_scalar_kernel_util.h} | 26 ++++- ...lar.cpp => dim_scatter_scalar_kernels.cpp} | 35 +++--- 5 files changed, 133 insertions(+), 122 deletions(-) delete mode 100644 oneflow/user/kernels/dim_scatter_scalar.cu create mode 100644 oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp create mode 100644 oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu rename oneflow/user/kernels/{dim_scatter_scalar.h => dim_scatter_scalar_kernel_util.h} (56%) rename oneflow/user/kernels/{dim_scatter_scalar.cpp => dim_scatter_scalar_kernels.cpp} (69%) diff --git a/oneflow/user/kernels/dim_scatter_scalar.cu b/oneflow/user/kernels/dim_scatter_scalar.cu deleted file mode 100644 index 82edc6c8939..00000000000 --- a/oneflow/user/kernels/dim_scatter_scalar.cu +++ /dev/null @@ -1,104 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_CUDA -#include "oneflow/user/kernels/dim_scatter_scalar.h" - -namespace oneflow { - -namespace user_op { - -namespace { - -template -__global__ void DoCUDADimScatterScalarUpdate(const DimOpIndexNdHelper idx_nd_helper, - const DimOpIndexNdHelper output_nd_helper, - const int ndim, const int64_t elem_cnt, - const int32_t dim, const int64_t upper_bound, - const IDX_T* index, const IN_T src_scalar, - IN_T* output) { - ScatterScalarUpdateFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, - upper_bound, index, src_scalar, output); -} -} // namespace - -template -class GpuDimScatterScalarUpdateKernel final : public OpKernel { - public: - GpuDimScatterScalarUpdateKernel() = default; - ~GpuDimScatterScalarUpdateKernel() = default; - - private: - void Compute(KernelComputeContext* ctx) const override { - const Tensor* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0); - const Tensor* index_tensor = ctx->Tensor4ArgNameAndIndex("index", 0); - Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("output", 0); - const int32_t dim = ctx->Attr("dim"); - - const IDX_T* index = index_tensor->dptr(); - IN_T* output = out_tensor->mut_dptr(); - size_t out_bytes_size = - out_tensor->shape().elem_cnt() * GetSizeOfDataType(out_tensor->data_type()); - - Tensor* like_tensor = ctx->Tensor4ArgNameAndIndex("like", 0); - const IN_T src_scalar = static_cast(ctx->Attr("src_scalar")); - - if (input_tensor) { - Memcpy(ctx->device_ctx(), output, input_tensor->dptr(), out_bytes_size); - } else if (like_tensor) { - Memset(ctx->device_ctx(), output, 0, out_bytes_size); - } else { - std::cout << "Unimplemented Error" << std::endl; - throw Error::Unimplemented(); - } - - const int ndim = out_tensor->shape().NumAxes(); - fixed_vector shape_vec(ndim); - auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { - std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), - [](int32_t dim) -> IDX_T { return static_cast(dim); }); - }; - shape2dims(index_tensor->shape()); - DimOpIndexNdHelper idx_nd_helper(shape_vec.data(), ndim); - shape2dims(out_tensor->shape()); - DimOpIndexNdHelper output_nd_helper(shape_vec.data(), ndim); - - int64_t upper_bound = input_tensor->shape().At(dim); - int64_t elem_cnt = index_tensor->shape().elem_cnt(); - - RUN_CUDA_KERNEL((DoCUDADimScatterScalarUpdate), ctx->device_ctx(), - BlocksNum4ThreadsNum(elem_cnt), idx_nd_helper, output_nd_helper, ndim, elem_cnt, - dim, upper_bound, index, src_scalar, output); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_GPU_SCATTERSCALAR_KERNEL(device, dtype, itype) \ - REGISTER_USER_KERNEL("dim_scatter_scalar_update") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ - & (user_op::HobDataType("input", 0) == GetDataType::value) \ - & (user_op::HobDataType("index", 0) == GetDataType::value)); - -REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int32_t); -REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int64_t); -REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float16, int32_t); -REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float16, int64_t); -REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int32_t); -REGISTER_GPU_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int64_t); - -} // namespace user_op -} // namespace oneflow -#endif diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp new file mode 100644 index 
00000000000..fc371739340 --- /dev/null +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp @@ -0,0 +1,38 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/dim_scatter_scalar_kernel_util.h" + +namespace oneflow { + +namespace user_op { + +template +struct DimScatterUpdateScalarFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, + const IDX_T* index, const IN_T src, IN_T* output) { + DoScatterUpdateScalarFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, src, output); + } +}; + +INSTANTIATE_DIM_SCATTER_UPDATE_SCARLAR_FUNCTORS(DeviceType::kCPU); +template struct DimScatterUpdateScalarFunctor; +template struct DimScatterUpdateScalarFunctor; + +} // namespace user_op +} // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu new file mode 100644 index 00000000000..b0900ec1dec --- /dev/null +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu @@ -0,0 +1,52 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_CUDA +#include "oneflow/user/kernels/dim_scatter_scalar_kernel_util.h" + +namespace oneflow { + +namespace user_op { + +template +__global__ void DoCUDADimScatterUpdateScalar(const DimOpIndexNdHelper idx_nd_helper, + const DimOpIndexNdHelper output_nd_helper, + const int ndim, const int64_t elem_cnt, + const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const IN_T src_scalar, + IN_T* output) { + DoScatterUpdateScalarFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, src_scalar, output); +} + +template +struct DimScatterUpdateScalarFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, + const IDX_T* index, const IN_T src, IN_T* output) { + RUN_CUDA_KERNEL((DoCUDADimScatterUpdateScalar), ctx, + BlocksNum4ThreadsNum(elem_cnt), idx_nd_helper, output_nd_helper, ndim, elem_cnt, + dim, upper_bound, index, src, output); + } +}; + +INSTANTIATE_DIM_SCATTER_UPDATE_SCARLAR_FUNCTORS(DeviceType::kGPU); +template struct DimScatterUpdateScalarFunctor; +template struct DimScatterUpdateScalarFunctor; + +} // namespace user_op +} // namespace oneflow +#endif diff --git a/oneflow/user/kernels/dim_scatter_scalar.h b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h similarity index 56% rename from oneflow/user/kernels/dim_scatter_scalar.h rename to oneflow/user/kernels/dim_scatter_scalar_kernel_util.h index 79c00e092d3..48e2760f3fb 100644 --- a/oneflow/user/kernels/dim_scatter_scalar.h +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h @@ -27,15 +27,31 @@ namespace user_op { constexpr int kDimGatherMaxDimCount = 8; +#define INSTANTIATE_DIM_SCATTER_UPDATE_SCARLAR_FUNCTORS(device_type) \ + template struct DimScatterUpdateScalarFunctor; \ + template struct DimScatterUpdateScalarFunctor; \ + template struct DimScatterUpdateScalarFunctor; \ + template struct DimScatterUpdateScalarFunctor; \ + template struct DimScatterUpdateScalarFunctor; \ + template struct DimScatterUpdateScalarFunctor; + template using DimOpIndexNdHelper = NdIndexOffsetHelper; +template +struct DimScatterUpdateScalarFunctor final { + void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, + const IDX_T* index, const IN_T src, IN_T* output); +}; + template -OF_DEVICE_FUNC void ScatterScalarUpdateFunctor(const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, - const int ndim, const int64_t elem_cnt, - const int32_t dim, int64_t upper_bound, - const IDX_T* index, const IN_T src, IN_T* output) { +OF_DEVICE_FUNC void DoScatterUpdateScalarFunctor(const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, + const int ndim, const int64_t elem_cnt, + const int32_t dim, int64_t upper_bound, + const IDX_T* index, const IN_T src, IN_T* output) { XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; diff --git a/oneflow/user/kernels/dim_scatter_scalar.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp similarity index 69% rename from oneflow/user/kernels/dim_scatter_scalar.cpp rename to oneflow/user/kernels/dim_scatter_scalar_kernels.cpp index 3bbe8b8d848..1a53081f962 100644 --- a/oneflow/user/kernels/dim_scatter_scalar.cpp +++ 
b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp @@ -13,17 +13,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "oneflow/user/kernels/dim_scatter_scalar.h" +#include "oneflow/user/kernels/dim_scatter_scalar_kernel_util.h" namespace oneflow { namespace user_op { template -class CpuDimScatterScalarUpdateKernel final : public user_op::OpKernel { +class DimScatterUpdateScalarKernel final : public user_op::OpKernel { public: - CpuDimScatterScalarUpdateKernel() = default; - ~CpuDimScatterScalarUpdateKernel() = default; + DimScatterUpdateScalarKernel() = default; + ~DimScatterUpdateScalarKernel() = default; private: void Compute(KernelComputeContext* ctx) const override { @@ -62,25 +62,34 @@ class CpuDimScatterScalarUpdateKernel final : public user_op::OpKernel { int64_t upper_bound = input_tensor->shape().At(dim); - ScatterScalarUpdateFunctor(idx_nd_helper, output_nd_helper, ndim, - index_tensor->shape().elem_cnt(), dim, upper_bound, - index, src_scalar, output); + DimScatterUpdateScalarFunctor()( + ctx->device_ctx(), idx_nd_helper, output_nd_helper, ndim, index_tensor->shape().elem_cnt(), + dim, upper_bound, index, src_scalar, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_CPU_SCATTERSCALAR_KERNEL(device, dtype, itype) \ +#define REGISTER_SCATTERSCALAR_KERNEL(device, dtype, itype) \ REGISTER_USER_KERNEL("dim_scatter_scalar_update") \ - .SetCreateFn>() \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int32_t); -REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int64_t); -REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, double, int32_t); -REGISTER_CPU_SCATTERSCALAR_KERNEL(DeviceType::kCPU, double, int64_t); +REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int32_t); +REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int64_t); +REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kCPU, double, int32_t); +REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kCPU, double, int64_t); + +#ifdef WITH_CUDA +REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int32_t); +REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int64_t); +REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float16, int32_t); +REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float16, int64_t); +REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int32_t); +REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int64_t); +#endif // WITH_CUDA } // namespace user_op } // namespace oneflow From 77d2c9ec5a76c15fc9a0a440f26668428bdd04c8 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 11:34:50 +0800 Subject: [PATCH 62/82] refine --- oneflow/user/kernels/dim_scatter_scalar_kernel_util.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h index 48e2760f3fb..acf718aace1 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h @@ -13,8 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifndef ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_H_ -#define ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_H_ +#ifndef ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_KERNEL_UTIL_H_ +#define ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_KERNEL_UTIL_H_ #include "oneflow/core/device/device_context.h" #include "oneflow/core/ndarray/xpu_util.h" #include "oneflow/core/common/nd_index_offset_helper.h" @@ -75,4 +75,4 @@ OF_DEVICE_FUNC void DoScatterUpdateScalarFunctor(const DimOpIndexNdHelper } // namespace user_op } // namespace oneflow -#endif // ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_H_ +#endif // ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_KERNEL_UTIL_H_ From 4ee8b5486892e8cb8d04af55e99b0b1b4a4d6722 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 11:50:54 +0800 Subject: [PATCH 63/82] refine register ops --- oneflow/user/ops/dim_scatter_ops.cpp | 89 +++++++++++++--------------- 1 file changed, 42 insertions(+), 47 deletions(-) diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index bcc1842bc69..82b4565d95b 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -177,6 +177,46 @@ Maybe InferScalarDtype(user_op::InferContext* ctx) { return Maybe::Ok(); } +void ScatterBackward(user_op::BackwardOpConfContext* ctx) { + const TensorDesc& src = ctx->FwOp().TensorDesc4ArgNameAndIndex("src", 0); + const TensorDesc& index = ctx->FwOp().TensorDesc4ArgNameAndIndex("index", 0); + const int64_t ndim = src.shape().NumAxes(); + + FOR_RANGE(int64_t, i, 0, ndim) { + if (index.shape().At(i) != src.shape().At(i)) { + UNIMPLEMENTED() << "The backward pass is implemented only for src.shape == index.shape.\n"; + } + } + + const auto op_src_grad_name = ctx->FwOp().op_name() + "_src_grad"; + ctx->DefineOp(op_src_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { + return builder.OpTypeName("dim_gather") + .InputBind("index", ctx->FwOp().input("index", 0)) + .InputBind("input", ctx->FwOp().output_grad("output", 0)) + .Output("output") + .Attr("dim", ctx->FwOp().attr("dim")) + .Build(); + }); + ctx->FwOp().InputGradBind(user_op::OpArg("src", 0), + [&ctx, &op_src_grad_name]() -> const std::string& { + return ctx->GetOp(op_src_grad_name).output("output", 0); + }); + const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; + ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { + return builder.OpTypeName("dim_scatter_scalar_update") + .InputBind("index", ctx->FwOp().input("index", 0)) + .InputBind("input", ctx->FwOp().output_grad("output", 0)) + .Output("output") + .Attr("dim", ctx->FwOp().attr("dim")) + .Attr("src_scalar", static_cast(0.0)) + .Build(); + }); + ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), + [&ctx, &op_input_grad_name]() -> const std::string& { + return ctx->GetOp(op_input_grad_name).output("output", 0); + }); +} + } // namespace #define REGISTER_SCATTER_LIKE_OP(optypename) \ @@ -215,53 +255,8 @@ Maybe InferScalarDtype(user_op::InferContext* ctx) { .SetDataTypeInferFn(InferScalarDtype) \ .SetGetSbpFn(SetSbpScatter) -#define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ - REGISTER_USER_OP_GRAD(optypename) \ - .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ - const TensorDesc& src = ctx->FwOp().TensorDesc4ArgNameAndIndex("src", 0); \ - const TensorDesc& index = ctx->FwOp().TensorDesc4ArgNameAndIndex("index", 0); \ - const int64_t ndim = src.shape().NumAxes(); \ - bool backprop_flag = true; \ - FOR_RANGE(int64_t, i, 0, ndim) { \ - if (index.shape().At(i) != 
src.shape().At(i)) { \ - backprop_flag = false; \ - break; \ - } \ - } \ - if (backprop_flag) { \ - const auto op_src_grad_name = ctx->FwOp().op_name() + "_src_grad"; \ - ctx->DefineOp(op_src_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ - return builder.OpTypeName("dim_gather") \ - .InputBind("index", ctx->FwOp().input("index", 0)) \ - .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ - .Output("output") \ - .Attr("dim", ctx->FwOp().attr("dim")) \ - .Build(); \ - }); \ - ctx->FwOp().InputGradBind(user_op::OpArg("src", 0), \ - [&ctx, &op_src_grad_name]() -> const std::string& { \ - return ctx->GetOp(op_src_grad_name).output("output", 0); \ - }); \ - const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; \ - ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ - return builder.OpTypeName("dim_scatter_scalar_update") \ - .InputBind("index", ctx->FwOp().input("index", 0)) \ - .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ - .Output("output") \ - .Attr("dim", ctx->FwOp().attr("dim")) \ - .Attr("src_scalar", static_cast(0.0)) \ - .Build(); \ - }); \ - ctx->FwOp().InputGradBind(user_op::OpArg("input", 0), \ - [&ctx, &op_input_grad_name]() -> const std::string& { \ - return ctx->GetOp(op_input_grad_name).output("output", 0); \ - }); \ - } else { \ - std::cout << "The backward pass is implemented only for src.shape == index.shape." \ - << std::endl; \ - throw Error::Unimplemented(); \ - } \ - }); +#define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ + REGISTER_USER_OP_GRAD(optypename).SetBackwardOpConfGenFn(ScatterBackward); #define REGISTER_USER_OP_GRAD_SCATTER_SCALAR(optypename) \ REGISTER_USER_OP_GRAD(optypename) \ From bb2d66f47a270a226f373eb2f03b1fe43be396f8 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 12:01:04 +0800 Subject: [PATCH 64/82] refine --- oneflow/user/kernels/dim_scatter_kernel_util.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cu b/oneflow/user/kernels/dim_scatter_kernel_util.cu index 26f001f05e5..151a329b300 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cu @@ -14,7 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef WITH_CUDA -// #include "oneflow/core/kernel/util/cuda_kernel_util.h" #include "oneflow/user/kernels/dim_scatter_kernel_util.h" namespace oneflow { From d873ef005ed6b8323fac2f29389103ce39d52466 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 12:37:57 +0800 Subject: [PATCH 65/82] add F.dim_scatter_scalar --- oneflow/core/functional/functional_api.yaml | 5 ++++ .../core/functional/impl/array_functor.cpp | 23 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index c14fa45fb0d..adfc9711f59 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -700,6 +700,11 @@ - name: "dim_scatter_add" signature: "Tensor DimScatterAdd(Tensor input, Tensor index, Tensor src, *, Int32 dim)" + bind_python: True + +- name: "dim_scatter_scalar" + signature: "Tensor DimScatterScalar(Tensor input, Tensor index, *, Float src, Int32 dim)" + bind_python: True - name: "tensor_setitem" signature: "Void TensorSetItem(Tensor x, *, TensorIndex index, Tensor value)" diff --git a/oneflow/core/functional/impl/array_functor.cpp b/oneflow/core/functional/impl/array_functor.cpp index 46c626579ba..d8173a4cb00 100644 --- a/oneflow/core/functional/impl/array_functor.cpp +++ b/oneflow/core/functional/impl/array_functor.cpp @@ -306,6 +306,28 @@ class DimScatterAddFunctor { std::shared_ptr op_; }; +class DimScatterScalarFunctor { + public: + DimScatterScalarFunctor() { + op_ = CHECK_JUST(one::OpBuilder("dim_scatter_scalar_update") + .Input("input") + .Input("index") + .Output("output") + .Build()); + } + Maybe operator()(const std::shared_ptr& input, + const std::shared_ptr& index, const float& src, + const int32_t& dim) const { + MutableAttrMap attrs; + JUST(attrs.SetAttr("dim", dim)); + JUST(attrs.SetAttr("src_scalar", src)); + return OpInterpUtil::Dispatch(*op_, {input, index}, attrs); + } + + private: + std::shared_ptr op_; +}; + class GatherNdFunctor { public: GatherNdFunctor() { @@ -989,6 +1011,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("TensorGetItem"); m.add_functor("DimScatter"); m.add_functor("DimScatterAdd"); + m.add_functor("DimScatterScalar"); m.add_functor("TensorSetItem"); }; From 33c536195fb4f0bb8dd0aca4b1e7164be75f68d6 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 13:42:34 +0800 Subject: [PATCH 66/82] add scatter op --- oneflow/python/nn/modules/scatter.py | 90 ++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 oneflow/python/nn/modules/scatter.py diff --git a/oneflow/python/nn/modules/scatter.py b/oneflow/python/nn/modules/scatter.py new file mode 100644 index 00000000000..fe3491fa263 --- /dev/null +++ b/oneflow/python/nn/modules/scatter.py @@ -0,0 +1,90 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import oneflow as flow + +from oneflow.python.framework.tensor import Tensor +from oneflow.python.oneflow_export import oneflow_export, experimental_api +from oneflow.python.framework.tensor import register_tensor_op +from oneflow.python.nn.module import Module + +from typing import Optional, List, Tuple + +class Scatter(Module): + def __init__(self) -> None: + super().__init__() + + def forward(self, input, dim, index, src, reduce): + assert type(src) in [flow.Tensor, float], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) + if isinstance(src, flow.Tensor): + return flow.F.dim_scatter(input, index, src, dim) + elif isinstance(src, float): + return flow.F.dim_scatter_scalar(input, index, src, dim) + +@oneflow_export("scatter") +@experimental_api +def scatter_op(input, dim, index, src, reduce:Optional[str]=None): + r"""This operator writes the elements specified by `index` along with the axis + `dim` from the `src` into the `input`. + + Take a 3-D blob as example, the output is specified by: + + .. code-block:: python + + input[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + input[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + input[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 + + input, index and src (if it is a Tensor) should all have the same number of dimensions. + It is also required that index.shape(d) <= src.shape(d) for all dimensions d, + and that index.shape(d) <= self.shape(d) for all dimensions d != dim. + Note that index and src do not broadcast. + + Args: + input (Tensor): The input blob. + dim (int): The axis along which to index + index (Tensor): The index blob of elements to scatter. + src (Tensor or float): The source blob whose elements will be scatterd and updated to output. + + Returns: + Tensor: The scatterd Tensor. + + For example: + + .. code-block:: python + + >>> import oneflow.experimental as flow + >>> import numpy as np + + >>> input = flow.ones((3,5)) + >>> index = flow.tensor(np.array([[0,1,2],[0,1,4]], ), dtype=flow.int32) + >>> src = flow.Tensor(np.random.rand(2,5)) + >>> out = flow.scatter(input, 1, index, src) + >>> out.shape + flow.Size([3, 5]) + >>> out = flow.scatter(input, 1, index, 3.14) + >>> out + tensor([[3.14, 3.14, 3.14, 1. , 1. ], + [3.14, 3.14, 1. , 1. , 3.14], + [1. , 1. , 1. , 1. , 1. 
]], dtype=oneflow.float32) + + """ + + return Scatter()(input, dim, index, src, reduce) + +if __name__ == "__main__": + import doctest + + doctest.testmod(raise_on_error=True) From b42c79571c376c9dc900ac24e483629be529a663 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 14:00:55 +0800 Subject: [PATCH 67/82] refine docstr --- oneflow/python/nn/modules/scatter.py | 31 +++++++++++++++++++++++----- oneflow/python/ops/array_ops.py | 6 +++--- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/oneflow/python/nn/modules/scatter.py b/oneflow/python/nn/modules/scatter.py index fe3491fa263..d8857d41b8c 100644 --- a/oneflow/python/nn/modules/scatter.py +++ b/oneflow/python/nn/modules/scatter.py @@ -22,20 +22,27 @@ from typing import Optional, List, Tuple + class Scatter(Module): def __init__(self) -> None: super().__init__() def forward(self, input, dim, index, src, reduce): - assert type(src) in [flow.Tensor, float], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) + assert type(src) in [ + flow.Tensor, + float, + ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) + assert reduce is None, "reduce not implemented yet" + if isinstance(src, flow.Tensor): return flow.F.dim_scatter(input, index, src, dim) elif isinstance(src, float): return flow.F.dim_scatter_scalar(input, index, src, dim) + @oneflow_export("scatter") @experimental_api -def scatter_op(input, dim, index, src, reduce:Optional[str]=None): +def scatter_op(input, dim, index, src, reduce: Optional[str] = None): r"""This operator writes the elements specified by `index` along with the axis `dim` from the `src` into the `input`. @@ -70,10 +77,12 @@ def scatter_op(input, dim, index, src, reduce:Optional[str]=None): >>> input = flow.ones((3,5)) >>> index = flow.tensor(np.array([[0,1,2],[0,1,4]], ), dtype=flow.int32) - >>> src = flow.Tensor(np.random.rand(2,5)) + >>> src = flow.Tensor(np.array([[0,10,20,30,40],[50,60,70,80,90]])) >>> out = flow.scatter(input, 1, index, src) - >>> out.shape - flow.Size([3, 5]) + >>> out + tensor([[ 0., 10., 20., 1., 1.], + [50., 60., 1., 1., 70.], + [ 1., 1., 1., 1., 1.]], dtype=oneflow.float32) >>> out = flow.scatter(input, 1, index, 3.14) >>> out tensor([[3.14, 3.14, 3.14, 1. , 1. ], @@ -84,6 +93,18 @@ def scatter_op(input, dim, index, src, reduce:Optional[str]=None): return Scatter()(input, dim, index, src, reduce) + +@register_tensor_op +@experimental_api +def scatter_tensor_op(input, dim, index, src, reduce: Optional[str] = None): + r""" + In-place version of :func:`oneflow.experimental.scatter` + + """ + + return Scatter()(input, dim, index, src, reduce) + + if __name__ == "__main__": import doctest diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 89dcc98102c..0f768cb0b4e 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2691,9 +2691,9 @@ def dim_scatter_update( .. code-block:: python - output[index[i][j][k]][j][k] = input[i][j][k] # if dim == 0 - output[i][index[i][j][k]][k] = input[i][j][k] # if dim == 1 - output[i][j][index[i][j][k]] = input[i][j][k] # if dim == 2 + output[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + output[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + output[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 input, index and src (if it is a Tensor) should all have the same number of dimensions. 
It is also required that index.shape(d) <= src.shape(d) for all dimensions d, From 5667dd4f064c2301a9fd35d5aececeb432f58c11 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 16:20:25 +0800 Subject: [PATCH 68/82] add scatter reduce arg --- oneflow/core/functional/functional_api.yaml | 10 +++- .../core/functional/impl/array_functor.cpp | 54 ++++++++++++++++-- oneflow/python/nn/modules/scatter.py | 37 +++++++++--- oneflow/python/ops/array_ops.py | 2 +- .../dim_scatter_scalar_kernel_util.cpp | 22 +++++--- .../kernels/dim_scatter_scalar_kernel_util.cu | 33 ++++++----- .../kernels/dim_scatter_scalar_kernel_util.h | 56 +++++++++++++------ .../kernels/dim_scatter_scalar_kernels.cpp | 44 +++++++++------ oneflow/user/ops/dim_scatter_ops.cpp | 21 ++++--- 9 files changed, 197 insertions(+), 82 deletions(-) diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index adfc9711f59..2070cb1411c 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -703,7 +703,15 @@ bind_python: True - name: "dim_scatter_scalar" - signature: "Tensor DimScatterScalar(Tensor input, Tensor index, *, Float src, Int32 dim)" + signature: "Tensor DimScatterUpdateScalar(Tensor input, Tensor index, *, Float src, Int32 dim)" + bind_python: True + +- name: "dim_scatter_add_scalar" + signature: "Tensor DimScatterAddScalar(Tensor input, Tensor index, *, Float src, Int32 dim)" + bind_python: True + +- name: "dim_scatter_mul_scalar" + signature: "Tensor DimScatterMulScalar(Tensor input, Tensor index, *, Float src, Int32 dim)" bind_python: True - name: "tensor_setitem" diff --git a/oneflow/core/functional/impl/array_functor.cpp b/oneflow/core/functional/impl/array_functor.cpp index d8173a4cb00..7dcc7eb1208 100644 --- a/oneflow/core/functional/impl/array_functor.cpp +++ b/oneflow/core/functional/impl/array_functor.cpp @@ -306,10 +306,54 @@ class DimScatterAddFunctor { std::shared_ptr op_; }; -class DimScatterScalarFunctor { +class DimScatterUpdateScalarFunctor { public: - DimScatterScalarFunctor() { - op_ = CHECK_JUST(one::OpBuilder("dim_scatter_scalar_update") + DimScatterUpdateScalarFunctor() { + op_ = CHECK_JUST(one::OpBuilder("dim_scatter_update_scalar") + .Input("input") + .Input("index") + .Output("output") + .Build()); + } + Maybe operator()(const std::shared_ptr& input, + const std::shared_ptr& index, const float& src, + const int32_t& dim) const { + MutableAttrMap attrs; + JUST(attrs.SetAttr("dim", dim)); + JUST(attrs.SetAttr("src_scalar", src)); + return OpInterpUtil::Dispatch(*op_, {input, index}, attrs); + } + + private: + std::shared_ptr op_; +}; + +class DimScatterAddScalarFunctor { + public: + DimScatterAddScalarFunctor() { + op_ = CHECK_JUST(one::OpBuilder("dim_scatter_add_scalar") + .Input("input") + .Input("index") + .Output("output") + .Build()); + } + Maybe operator()(const std::shared_ptr& input, + const std::shared_ptr& index, const float& src, + const int32_t& dim) const { + MutableAttrMap attrs; + JUST(attrs.SetAttr("dim", dim)); + JUST(attrs.SetAttr("src_scalar", src)); + return OpInterpUtil::Dispatch(*op_, {input, index}, attrs); + } + + private: + std::shared_ptr op_; +}; + +class DimScatterMulScalarFunctor { + public: + DimScatterMulScalarFunctor() { + op_ = CHECK_JUST(one::OpBuilder("dim_scatter_mul_scalar") .Input("input") .Input("index") .Output("output") @@ -1011,7 +1055,9 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("TensorGetItem"); m.add_functor("DimScatter"); 
m.add_functor("DimScatterAdd"); - m.add_functor("DimScatterScalar"); + m.add_functor("DimScatterUpdateScalar"); + m.add_functor("DimScatterAddScalar"); + m.add_functor("DimScatterMulScalar"); m.add_functor("TensorSetItem"); }; diff --git a/oneflow/python/nn/modules/scatter.py b/oneflow/python/nn/modules/scatter.py index d8857d41b8c..fe56b81976d 100644 --- a/oneflow/python/nn/modules/scatter.py +++ b/oneflow/python/nn/modules/scatter.py @@ -32,11 +32,20 @@ def forward(self, input, dim, index, src, reduce): flow.Tensor, float, ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) - assert reduce is None, "reduce not implemented yet" + + assert reduce in [ + "add", + "multiply", + None, + ], "reduce must be 'add', 'multiply' or None" if isinstance(src, flow.Tensor): return flow.F.dim_scatter(input, index, src, dim) elif isinstance(src, float): + if reduce == "add": + return flow.F.dim_scatter_add_scalar(input, index, src, dim) + elif reduce == "multiply": + return flow.F.dim_scatter_mul_scalar(input, index, src, dim) return flow.F.dim_scatter_scalar(input, index, src, dim) @@ -75,20 +84,30 @@ def scatter_op(input, dim, index, src, reduce: Optional[str] = None): >>> import oneflow.experimental as flow >>> import numpy as np - >>> input = flow.ones((3,5)) + >>> input = flow.ones((3,5))*2 >>> index = flow.tensor(np.array([[0,1,2],[0,1,4]], ), dtype=flow.int32) >>> src = flow.Tensor(np.array([[0,10,20,30,40],[50,60,70,80,90]])) >>> out = flow.scatter(input, 1, index, src) >>> out - tensor([[ 0., 10., 20., 1., 1.], - [50., 60., 1., 1., 70.], - [ 1., 1., 1., 1., 1.]], dtype=oneflow.float32) + tensor([[ 0., 10., 20., 2., 2.], + [50., 60., 2., 2., 70.], + [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) >>> out = flow.scatter(input, 1, index, 3.14) >>> out - tensor([[3.14, 3.14, 3.14, 1. , 1. ], - [3.14, 3.14, 1. , 1. , 3.14], - [1. , 1. , 1. , 1. , 1. ]], dtype=oneflow.float32) - + tensor([[3.14, 3.14, 3.14, 2. , 2. ], + [3.14, 3.14, 2. , 2. , 3.14], + [2. , 2. , 2. , 2. , 2. ]], dtype=oneflow.float32) + >>> out = flow.scatter(input, 1, index, 3.14, reduce="add") + >>> out + tensor([[5.14, 5.14, 5.14, 2. , 2. ], + [5.14, 5.14, 2. , 2. , 5.14], + [2. , 2. , 2. , 2. , 2. ]], dtype=oneflow.float32) + >>> out = flow.scatter(input, 1, index, 3.14, reduce="multiply") + >>> out + tensor([[6.28, 6.28, 6.28, 2. , 2. ], + [6.28, 6.28, 2. , 2. , 6.28], + [2. , 2. , 2. , 2. , 2. 
]], dtype=oneflow.float32) + """ return Scatter()(input, dim, index, src, reduce) diff --git a/oneflow/python/ops/array_ops.py b/oneflow/python/ops/array_ops.py index 0f768cb0b4e..ea194df8051 100644 --- a/oneflow/python/ops/array_ops.py +++ b/oneflow/python/ops/array_ops.py @@ -2765,7 +2765,7 @@ def dim_scatter_update_job( if name is not None else id_util.UniqueStr("DimScatterScalarUpdate_") ) - .Op("dim_scatter_scalar_update") + .Op("dim_scatter_update_scalar") .Input("input", [input]) .Input("index", [index]) .Attr("src_scalar", float(src)) diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp index fc371739340..b1e19e727e4 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp @@ -19,20 +19,28 @@ namespace oneflow { namespace user_op { -template -struct DimScatterUpdateScalarFunctor final { +template class Opt> +struct DimScatterScalarFunctor final { void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& idx_nd_helper, const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T src, IN_T* output) { - DoScatterUpdateScalarFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, - upper_bound, index, src, output); + DoScatterScalarFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, src, output); } }; -INSTANTIATE_DIM_SCATTER_UPDATE_SCARLAR_FUNCTORS(DeviceType::kCPU); -template struct DimScatterUpdateScalarFunctor; -template struct DimScatterUpdateScalarFunctor; +INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCPU, UpdateScalarFunctor); +template struct DimScatterScalarFunctor; +template struct DimScatterScalarFunctor; + +INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCPU, AddScalarFunctor); +template struct DimScatterScalarFunctor; +template struct DimScatterScalarFunctor; + +INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCPU, MulScalarFunctor); +template struct DimScatterScalarFunctor; +template struct DimScatterScalarFunctor; } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu index b0900ec1dec..16d259e6cac 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu @@ -20,32 +20,31 @@ namespace oneflow { namespace user_op { -template -__global__ void DoCUDADimScatterUpdateScalar(const DimOpIndexNdHelper idx_nd_helper, - const DimOpIndexNdHelper output_nd_helper, - const int ndim, const int64_t elem_cnt, - const int32_t dim, const int64_t upper_bound, - const IDX_T* index, const IN_T src_scalar, - IN_T* output) { - DoScatterUpdateScalarFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, - upper_bound, index, src_scalar, output); +template class Opt> +__global__ void DoCUDADimScatterScalar(const DimOpIndexNdHelper idx_nd_helper, + const DimOpIndexNdHelper output_nd_helper, + const int ndim, const int64_t elem_cnt, const int32_t dim, + const int64_t upper_bound, const IDX_T* index, + const IN_T src_scalar, IN_T* output) { + DoScatterScalarFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, src_scalar, output); } -template -struct DimScatterUpdateScalarFunctor final { +template class Opt> +struct DimScatterScalarFunctor final { void operator()(DeviceCtx* ctx, const 
DimOpIndexNdHelper& idx_nd_helper, const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T src, IN_T* output) { - RUN_CUDA_KERNEL((DoCUDADimScatterUpdateScalar), ctx, - BlocksNum4ThreadsNum(elem_cnt), idx_nd_helper, output_nd_helper, ndim, elem_cnt, - dim, upper_bound, index, src, output); + RUN_CUDA_KERNEL((DoCUDADimScatterScalar), ctx, BlocksNum4ThreadsNum(elem_cnt), + idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, upper_bound, index, src, + output); } }; -INSTANTIATE_DIM_SCATTER_UPDATE_SCARLAR_FUNCTORS(DeviceType::kGPU); -template struct DimScatterUpdateScalarFunctor; -template struct DimScatterUpdateScalarFunctor; +INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kGPU, UpdateScalarFunctor); +INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kGPU, AddScalarFunctor); +INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kGPU, MulScalarFunctor); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h index acf718aace1..a3656eda037 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h @@ -15,6 +15,9 @@ limitations under the License. */ #ifndef ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_KERNEL_UTIL_H_ #define ONEFLOW_USER_KERNELS_DIM_SCATTER_SCALAR_KERNEL_UTIL_H_ +#ifdef WITH_CUDA +#include "oneflow/core/cuda/atomic.cuh" +#endif // WITH_CUDA #include "oneflow/core/device/device_context.h" #include "oneflow/core/ndarray/xpu_util.h" #include "oneflow/core/common/nd_index_offset_helper.h" @@ -27,31 +30,52 @@ namespace user_op { constexpr int kDimGatherMaxDimCount = 8; -#define INSTANTIATE_DIM_SCATTER_UPDATE_SCARLAR_FUNCTORS(device_type) \ - template struct DimScatterUpdateScalarFunctor; \ - template struct DimScatterUpdateScalarFunctor; \ - template struct DimScatterUpdateScalarFunctor; \ - template struct DimScatterUpdateScalarFunctor; \ - template struct DimScatterUpdateScalarFunctor; \ - template struct DimScatterUpdateScalarFunctor; +template +struct AddScalarFunctor { + OF_DEVICE_FUNC static void apply(const T x, T* y) { +#ifdef __CUDA_ARCH__ + cuda::atomic::Add(y, x); +#else + *y += x; +#endif + } +}; + +template +struct UpdateScalarFunctor { + OF_DEVICE_FUNC static void apply(const T x, T* y) { *y = x; } +}; + +template +struct MulScalarFunctor { + OF_DEVICE_FUNC static void apply(const T x, T* y) { *y *= x; } +}; + +#define INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(device_type, opt) \ + template struct DimScatterScalarFunctor; \ + template struct DimScatterScalarFunctor; \ + template struct DimScatterScalarFunctor; \ + template struct DimScatterScalarFunctor; \ + template struct DimScatterScalarFunctor; \ + template struct DimScatterScalarFunctor; template using DimOpIndexNdHelper = NdIndexOffsetHelper; -template -struct DimScatterUpdateScalarFunctor final { +template class Opt> +struct DimScatterScalarFunctor final { void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& idx_nd_helper, const DimOpIndexNdHelper& output_nd_helper, const int ndim, const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, const IDX_T* index, const IN_T src, IN_T* output); }; -template -OF_DEVICE_FUNC void DoScatterUpdateScalarFunctor(const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, - const int ndim, const int64_t elem_cnt, - const int32_t dim, int64_t upper_bound, - 
const IDX_T* index, const IN_T src, IN_T* output) { +template class Opt> +OF_DEVICE_FUNC void DoScatterScalarFunctor(const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, + const int ndim, const int64_t elem_cnt, + const int32_t dim, int64_t upper_bound, + const IDX_T* index, const IN_T src, IN_T* output) { XPU_1D_KERNEL_LOOP(idx_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; @@ -68,7 +92,7 @@ OF_DEVICE_FUNC void DoScatterUpdateScalarFunctor(const DimOpIndexNdHelper } coordinate[dim] = idx_elem; IDX_T output_offset = output_nd_helper.NdIndexToOffset(coordinate, ndim); - *(output + output_offset) = src; + Opt::apply(src, output + output_offset); } } diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp index 1a53081f962..3396d2220ca 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp @@ -19,11 +19,11 @@ namespace oneflow { namespace user_op { -template -class DimScatterUpdateScalarKernel final : public user_op::OpKernel { +template class Opt> +class DimScatterScalarKernel final : public user_op::OpKernel { public: - DimScatterUpdateScalarKernel() = default; - ~DimScatterUpdateScalarKernel() = default; + DimScatterScalarKernel() = default; + ~DimScatterScalarKernel() = default; private: void Compute(KernelComputeContext* ctx) const override { @@ -62,7 +62,7 @@ class DimScatterUpdateScalarKernel final : public user_op::OpKernel { int64_t upper_bound = input_tensor->shape().At(dim); - DimScatterUpdateScalarFunctor()( + DimScatterScalarFunctor()( ctx->device_ctx(), idx_nd_helper, output_nd_helper, ndim, index_tensor->shape().elem_cnt(), dim, upper_bound, index, src_scalar, output); } @@ -70,25 +70,33 @@ class DimScatterUpdateScalarKernel final : public user_op::OpKernel { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_SCATTERSCALAR_KERNEL(device, dtype, itype) \ - REGISTER_USER_KERNEL("dim_scatter_scalar_update") \ - .SetCreateFn>() \ +#define REGISTER_SCATTERSCALAR_KERNEL(op_type_name, device, dtype, itype, opt) \ + REGISTER_USER_KERNEL(op_type_name) \ + .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceTag() == device) \ & (user_op::HobDataType("input", 0) == GetDataType::value) \ & (user_op::HobDataType("index", 0) == GetDataType::value)); -REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int32_t); -REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kCPU, float, int64_t); -REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kCPU, double, int32_t); -REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kCPU, double, int64_t); +#define REGISTER_SCATTER_SCALAR_CPU_KERNELS(op_type_name, opt) \ + REGISTER_SCATTERSCALAR_KERNEL(op_type_name, DeviceType::kCPU, float, int32_t, opt); \ + REGISTER_SCATTERSCALAR_KERNEL(op_type_name, DeviceType::kCPU, float, int64_t, opt); \ + REGISTER_SCATTERSCALAR_KERNEL(op_type_name, DeviceType::kCPU, double, int32_t, opt); \ + REGISTER_SCATTERSCALAR_KERNEL(op_type_name, DeviceType::kCPU, double, int64_t, opt); + +#define REGISTER_SCATTER_SCALAR_GPU_KERNELS(op_type_name, opt) \ + REGISTER_SCATTERSCALAR_KERNEL(op_type_name, DeviceType::kGPU, float, int32_t, opt); \ + REGISTER_SCATTERSCALAR_KERNEL(op_type_name, DeviceType::kGPU, float, int64_t, opt); \ + REGISTER_SCATTERSCALAR_KERNEL(op_type_name, DeviceType::kGPU, double, int32_t, opt); \ + REGISTER_SCATTERSCALAR_KERNEL(op_type_name, DeviceType::kGPU, double, int64_t, opt); + 
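
Note on the refactor above: UpdateScalarFunctor, AddScalarFunctor and MulScalarFunctor share the same index walk and differ only in the per-element apply (the add case goes through cuda::atomic::Add on device so concurrent writes to one output offset do not race). The following is a minimal NumPy sketch of what DoScatterScalarFunctor computes; dim_scatter_scalar_ref and the op lambdas are illustrative names, not part of the patch, and the upper_bound check is omitted.

import numpy as np

# NumPy model of the scalar scatter family (a sketch, not the CPU/CUDA kernel):
# each element of `index` redirects its own coordinate along `dim`, and `op`
# plays the role of the Opt template parameter.
def dim_scatter_scalar_ref(output, dim, index, src_scalar, op):
    out = output.copy()
    for idx in np.ndindex(index.shape):   # same iteration space as the index tensor
        coord = list(idx)
        coord[dim] = int(index[idx])      # redirect the coordinate along `dim`
        out[tuple(coord)] = op(out[tuple(coord)], src_scalar)
    return out

update = lambda y, x: x        # UpdateScalarFunctor: *y = x
add = lambda y, x: y + x       # AddScalarFunctor:    *y += x
mul = lambda y, x: y * x       # MulScalarFunctor:    *y *= x
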
+REGISTER_SCATTER_SCALAR_CPU_KERNELS("dim_scatter_update_scalar", UpdateScalarFunctor); +REGISTER_SCATTER_SCALAR_CPU_KERNELS("dim_scatter_add_scalar", AddScalarFunctor); +REGISTER_SCATTER_SCALAR_CPU_KERNELS("dim_scatter_mul_scalar", MulScalarFunctor); #ifdef WITH_CUDA -REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int32_t); -REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float, int64_t); -REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float16, int32_t); -REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, float16, int64_t); -REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int32_t); -REGISTER_SCATTERSCALAR_KERNEL(DeviceType::kGPU, double, int64_t); +REGISTER_SCATTER_SCALAR_GPU_KERNELS("dim_scatter_update_scalar", UpdateScalarFunctor); +REGISTER_SCATTER_SCALAR_GPU_KERNELS("dim_scatter_add_scalar", AddScalarFunctor); +REGISTER_SCATTER_SCALAR_GPU_KERNELS("dim_scatter_mul_scalar", MulScalarFunctor); #endif // WITH_CUDA } // namespace user_op diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 82b4565d95b..00951b02930 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -203,7 +203,7 @@ void ScatterBackward(user_op::BackwardOpConfContext* ctx) { }); const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { - return builder.OpTypeName("dim_scatter_scalar_update") + return builder.OpTypeName("dim_scatter_update_scalar") .InputBind("index", ctx->FwOp().input("index", 0)) .InputBind("input", ctx->FwOp().output_grad("output", 0)) .Output("output") @@ -255,15 +255,15 @@ void ScatterBackward(user_op::BackwardOpConfContext* ctx) { .SetDataTypeInferFn(InferScalarDtype) \ .SetGetSbpFn(SetSbpScatter) -#define REGISTER_USER_OP_GRAD_SCATTER(optypename) \ +#define REGISTER_SCATTER_GRAD(optypename) \ REGISTER_USER_OP_GRAD(optypename).SetBackwardOpConfGenFn(ScatterBackward); -#define REGISTER_USER_OP_GRAD_SCATTER_SCALAR(optypename) \ +#define REGISTER_SCATTER_SCALAR_GRAD(optypename) \ REGISTER_USER_OP_GRAD(optypename) \ .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) { \ const auto op_input_grad_name = ctx->FwOp().op_name() + "_input_grad"; \ ctx->DefineOp(op_input_grad_name, [&ctx](user_op::BackwardOpBuilder& builder) { \ - return builder.OpTypeName("dim_scatter_scalar_update") \ + return builder.OpTypeName("dim_scatter_update_scalar") \ .InputBind("index", ctx->FwOp().input("index", 0)) \ .InputBind("input", ctx->FwOp().output_grad("output", 0)) \ .Output("output") \ @@ -282,12 +282,15 @@ REGISTER_SCATTER_LIKE_OP("dim_scatter_update_like"); REGISTER_SCATTER_OP("dim_scatter_add"); REGISTER_SCATTER_OP("dim_scatter_update"); -REGISTER_SCATTER_SCALAR_OP("dim_scatter_scalar_update"); -REGISTER_SCATTER_SCALAR_OP("dim_scatter_scalar_add"); +REGISTER_SCATTER_SCALAR_OP("dim_scatter_update_scalar"); +REGISTER_SCATTER_SCALAR_OP("dim_scatter_add_scalar"); +REGISTER_SCATTER_SCALAR_OP("dim_scatter_mul_scalar"); -REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_add"); -REGISTER_USER_OP_GRAD_SCATTER("dim_scatter_update"); +REGISTER_SCATTER_GRAD("dim_scatter_add"); +REGISTER_SCATTER_GRAD("dim_scatter_update"); -REGISTER_USER_OP_GRAD_SCATTER_SCALAR("dim_scatter_scalar_update"); +REGISTER_SCATTER_SCALAR_GRAD("dim_scatter_update_scalar"); +REGISTER_SCATTER_SCALAR_GRAD("dim_scatter_add_scalar"); +REGISTER_SCATTER_SCALAR_GRAD("dim_scatter_mul_scalar"); } // namespace user_op } // namespace oneflow From 
d7b9d443f5b215923c14d48726dc5b699c0c6539 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sun, 18 Jul 2021 16:43:06 +0800 Subject: [PATCH 69/82] finally(!): a draft for scatter constitent with pytroch --- oneflow/core/functional/functional_api.yaml | 4 ++++ .../core/functional/impl/array_functor.cpp | 23 +++++++++++++++++++ oneflow/python/nn/modules/scatter.py | 17 +++++++++++++- .../user/kernels/dim_scatter_kernel_util.cpp | 4 ++++ .../user/kernels/dim_scatter_kernel_util.cu | 1 + .../user/kernels/dim_scatter_kernel_util.h | 5 ++++ oneflow/user/kernels/dim_scatter_kernels.cpp | 2 ++ oneflow/user/ops/dim_scatter_ops.cpp | 2 ++ 8 files changed, 57 insertions(+), 1 deletion(-) diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 2070cb1411c..932b0798dc7 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -702,6 +702,10 @@ signature: "Tensor DimScatterAdd(Tensor input, Tensor index, Tensor src, *, Int32 dim)" bind_python: True +- name: "dim_scatter_mul" + signature: "Tensor DimScatterMul(Tensor input, Tensor index, Tensor src, *, Int32 dim)" + bind_python: True + - name: "dim_scatter_scalar" signature: "Tensor DimScatterUpdateScalar(Tensor input, Tensor index, *, Float src, Int32 dim)" bind_python: True diff --git a/oneflow/core/functional/impl/array_functor.cpp b/oneflow/core/functional/impl/array_functor.cpp index 7dcc7eb1208..89d907a1f3b 100644 --- a/oneflow/core/functional/impl/array_functor.cpp +++ b/oneflow/core/functional/impl/array_functor.cpp @@ -306,6 +306,28 @@ class DimScatterAddFunctor { std::shared_ptr op_; }; +class DimScatterMulFunctor { + public: + DimScatterMulFunctor() { + op_ = CHECK_JUST(one::OpBuilder("dim_scatter_mul") + .Input("input") + .Input("index") + .Input("src") + .Output("output") + .Build()); + } + Maybe operator()(const std::shared_ptr& input, + const std::shared_ptr& index, + const std::shared_ptr& src, const int32_t& dim) const { + MutableAttrMap attrs; + JUST(attrs.SetAttr("dim", dim)); + return OpInterpUtil::Dispatch(*op_, {input, index, src}, attrs); + } + + private: + std::shared_ptr op_; +}; + class DimScatterUpdateScalarFunctor { public: DimScatterUpdateScalarFunctor() { @@ -1055,6 +1077,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("TensorGetItem"); m.add_functor("DimScatter"); m.add_functor("DimScatterAdd"); + m.add_functor("DimScatterMul"); m.add_functor("DimScatterUpdateScalar"); m.add_functor("DimScatterAddScalar"); m.add_functor("DimScatterMulScalar"); diff --git a/oneflow/python/nn/modules/scatter.py b/oneflow/python/nn/modules/scatter.py index fe56b81976d..a3a8747b1dd 100644 --- a/oneflow/python/nn/modules/scatter.py +++ b/oneflow/python/nn/modules/scatter.py @@ -40,6 +40,10 @@ def forward(self, input, dim, index, src, reduce): ], "reduce must be 'add', 'multiply' or None" if isinstance(src, flow.Tensor): + if reduce == "add": + return flow.F.dim_scatter_add(input, index, src, dim) + elif reduce == "multiply": + return flow.F.dim_scatter_mul(input, index, src, dim) return flow.F.dim_scatter(input, index, src, dim) elif isinstance(src, float): if reduce == "add": @@ -73,6 +77,7 @@ def scatter_op(input, dim, index, src, reduce: Optional[str] = None): dim (int): The axis along which to index index (Tensor): The index blob of elements to scatter. src (Tensor or float): The source blob whose elements will be scatterd and updated to output. + reduce (string): reduction operation to apply, can be either 'add' or 'multiply'. 
Returns: Tensor: The scatterd Tensor. @@ -92,6 +97,16 @@ def scatter_op(input, dim, index, src, reduce: Optional[str] = None): tensor([[ 0., 10., 20., 2., 2.], [50., 60., 2., 2., 70.], [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) + >>> out = flow.scatter(input, 1, index, src, reduce="add") + >>> out + tensor([[ 2., 12., 22., 2., 2.], + [52., 62., 2., 2., 72.], + [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) + >>> out = flow.scatter(input, 1, index, src, reduce="multiply") + >>> out + tensor([[ 0., 20., 40., 2., 2.], + [100., 120., 2., 2., 140.], + [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) >>> out = flow.scatter(input, 1, index, 3.14) >>> out tensor([[3.14, 3.14, 3.14, 2. , 2. ], @@ -127,4 +142,4 @@ def scatter_tensor_op(input, dim, index, src, reduce: Optional[str] = None): if __name__ == "__main__": import doctest - doctest.testmod(raise_on_error=True) + doctest.testmod(raise_on_error=False) diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.cpp index ae8cbfd65ce..a4d822fb599 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cpp @@ -40,5 +40,9 @@ INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCPU, BinOpUpdateFunctor); template struct DimScatterFunctor; template struct DimScatterFunctor; +INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCPU, BinOpMulFunctor); +template struct DimScatterFunctor; +template struct DimScatterFunctor; + } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cu b/oneflow/user/kernels/dim_scatter_kernel_util.cu index 151a329b300..7f6058d9d5a 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cu @@ -59,6 +59,7 @@ struct DimScatterFunctor final { INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kGPU, BinOpAddFunctor); INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kGPU, BinOpUpdateFunctor); +INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kGPU, BinOpMulFunctor); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index 34ff5cd31d3..4b270441a9c 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -59,6 +59,11 @@ struct BinOpUpdateFunctor { OF_DEVICE_FUNC static void apply(const T* x, T* y) { *y = *x; } }; +template +struct BinOpMulFunctor { + OF_DEVICE_FUNC static void apply(const T* x, T* y) { *y *= *x; } +}; + template class Opt> struct DimScatterFunctor final { void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 4238a5622a0..8f2ad2ca831 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -128,12 +128,14 @@ REGISTER_DIM_SCATTER_LIKE_CPU_KERNELS("dim_scatter_add_like", BinOpAddFunctor); REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_add", BinOpAddFunctor); REGISTER_DIM_SCATTER_LIKE_CPU_KERNELS("dim_scatter_update_like", BinOpUpdateFunctor); REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_update", BinOpUpdateFunctor); +REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_mul", BinOpMulFunctor); #ifdef WITH_CUDA REGISTER_DIM_SCATTER_LIKE_GPU_KERNELS("dim_scatter_add_like", BinOpAddFunctor); REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_add", BinOpAddFunctor); 
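
For the tensor-src variants registered here (dim_scatter, dim_scatter_add, dim_scatter_mul), the doctest outputs above follow from the per-element rule output[i][index[i][j]] op= src[i][j] when dim == 1. The snippet below is a NumPy check of that rule for the 2-D, dim=1 case only; dim_scatter_ref is an illustrative name, not the OneFlow API.

import numpy as np

# Reference semantics for the dim=1 doctests (illustrative sketch only):
# positions not covered by `index` keep their original value.
def dim_scatter_ref(inp, dim, index, src, op):
    assert dim == 1 and inp.ndim == 2
    out = inp.copy()
    for i in range(index.shape[0]):
        for j in range(index.shape[1]):
            out[i, index[i, j]] = op(out[i, index[i, j]], src[i, j])
    return out

inp = np.ones((3, 5)) * 2
index = np.array([[0, 1, 2], [0, 1, 4]])
src = np.array([[0., 10., 20., 30., 40.], [50., 60., 70., 80., 90.]])
print(dim_scatter_ref(inp, 1, index, src, lambda y, x: y + x))  # reduce="add" case
print(dim_scatter_ref(inp, 1, index, src, lambda y, x: y * x))  # reduce="multiply" case
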
REGISTER_DIM_SCATTER_LIKE_GPU_KERNELS("dim_scatter_update_like", BinOpUpdateFunctor); REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_update", BinOpUpdateFunctor); +REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_mul", BinOpMulFunctor); #endif // WITH_CUDA } // namespace user_op diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 00951b02930..3f9cfd302a1 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -281,6 +281,7 @@ REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); REGISTER_SCATTER_LIKE_OP("dim_scatter_update_like"); REGISTER_SCATTER_OP("dim_scatter_add"); REGISTER_SCATTER_OP("dim_scatter_update"); +REGISTER_SCATTER_OP("dim_scatter_mul"); REGISTER_SCATTER_SCALAR_OP("dim_scatter_update_scalar"); REGISTER_SCATTER_SCALAR_OP("dim_scatter_add_scalar"); @@ -288,6 +289,7 @@ REGISTER_SCATTER_SCALAR_OP("dim_scatter_mul_scalar"); REGISTER_SCATTER_GRAD("dim_scatter_add"); REGISTER_SCATTER_GRAD("dim_scatter_update"); +REGISTER_SCATTER_GRAD("dim_scatter_mul"); REGISTER_SCATTER_SCALAR_GRAD("dim_scatter_update_scalar"); REGISTER_SCATTER_SCALAR_GRAD("dim_scatter_add_scalar"); From dd105c800d6e9773cb0a9902eba2fb0966cf65f0 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Tue, 27 Jul 2021 13:59:40 +0800 Subject: [PATCH 70/82] change import package name --- .../single_client/test/ops/test_dim_scatter_ops.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/oneflow/compatible/single_client/test/ops/test_dim_scatter_ops.py b/python/oneflow/compatible/single_client/test/ops/test_dim_scatter_ops.py index 73182652d6c..f45375499a5 100644 --- a/python/oneflow/compatible/single_client/test/ops/test_dim_scatter_ops.py +++ b/python/oneflow/compatible/single_client/test/ops/test_dim_scatter_ops.py @@ -13,15 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. """ -import oneflow as flow -import numpy as np -import oneflow.typing as tp -from test_util import GenArgList -import unittest from collections import OrderedDict import os import random +import numpy as np + +import oneflow.compatible.single_client.unittest +from oneflow.compatible import single_client as flow +from oneflow.compatible.single_client import typing as tp +from test_util import GenArgList + + flow.config.enable_debug_mode(True) From 620e1b203af95c30e99eaee76428b100b06c97fa Mon Sep 17 00:00:00 2001 From: YaoChi Date: Tue, 27 Jul 2021 22:38:29 +0800 Subject: [PATCH 71/82] remmove lazy test and add scatter_add and scatter_mul --- python/oneflow/__init__.py | 1 + .../test/ops/test_dim_scatter_ops.py | 488 ------------------ python/oneflow/nn/modules/scatter.py | 173 ++++--- 3 files changed, 111 insertions(+), 551 deletions(-) delete mode 100644 python/oneflow/compatible/single_client/test/ops/test_dim_scatter_ops.py diff --git a/python/oneflow/__init__.py b/python/oneflow/__init__.py index 381a87898e5..2ef3e37aac2 100644 --- a/python/oneflow/__init__.py +++ b/python/oneflow/__init__.py @@ -360,5 +360,6 @@ def Sync(): from oneflow.ops.user_op_builder import ( api_user_op_module_builder as user_op_module_builder, ) +from oneflow.nn.modules.scatter import * from . 
import autograd, distributed, linalg, optim, saved_model diff --git a/python/oneflow/compatible/single_client/test/ops/test_dim_scatter_ops.py b/python/oneflow/compatible/single_client/test/ops/test_dim_scatter_ops.py deleted file mode 100644 index f45375499a5..00000000000 --- a/python/oneflow/compatible/single_client/test/ops/test_dim_scatter_ops.py +++ /dev/null @@ -1,488 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -from collections import OrderedDict -import os -import random - -import numpy as np - -import oneflow.compatible.single_client.unittest -from oneflow.compatible import single_client as flow -from oneflow.compatible.single_client import typing as tp -from test_util import GenArgList - - - -flow.config.enable_debug_mode(True) - - -def _bin_add(out_val, in_value): - return out_val + in_value - - -def _bin_update(out_val, in_value): - return in_value - - -def gen_scatter_like_test_sample( - src_shape, - index_shape, - dim, - like_shape, - is_float=True, - binop=_bin_add, - inplace=True, -): - def _np_dim_scatter_add_like(like, dim, index, src): - out_shape = like.shape - flatten_idx = index.flatten() - - if inplace: - output = like.copy() - else: - output = np.zeros(out_shape) - - for idx in range(0, index.size): - idx_coord = list(np.unravel_index(idx, index.shape)) - idx_elem = flatten_idx[idx] - src_offset = np.ravel_multi_index(idx_coord, src.shape) - idx_coord[dim] = idx_elem - output_offset = np.ravel_multi_index(idx_coord, out_shape) - output[np.unravel_index(output_offset, out_shape)] = binop( - output[np.unravel_index(output_offset, out_shape)], - src[np.unravel_index(src_offset, src.shape)], - ) - return output - - if is_float: - src = np.random.random(src_shape) - like = np.random.random(like_shape) - else: - src = np.random.randint(0, 100, src_shape) - like = np.random.randint(0, 100, like_shape) - - def _np_dim_gather(dim, input, index): - output = np.zeros(index.shape) - for idx in range(0, index.size): - incoord = np.unravel_index(idx, index.shape) - outcoord = [*incoord] - incoord = [*incoord] - incoord[dim] = index[np.unravel_index(idx, index.shape)] - output[tuple(outcoord)] = input[tuple(incoord)] - return output - - shape_elemcnt = 1 - index_shape_list = list(index_shape) - for i in range(len(index_shape_list)): - shape_elemcnt *= index_shape_list[i] - - index_total = [] - for i in range(int(shape_elemcnt / like_shape[dim])): - index_arr = np.arange(0, like_shape[dim]) - random.shuffle(index_arr) - index_total.append(index_arr) - - index = np.stack(index_total) - - output = _np_dim_scatter_add_like(like, dim, index, src) - - grad = _np_dim_gather(dim, np.ones(output.shape), index) - return { - "src": src, - "index": index, - "like": like, - "dim": dim, - "output": output, - "grad": grad, - } - - -def _gen_arg_dict( - grad_flag=False, - device_type="gpu", - value_type="float", - machine_ids="0:0", - device_count=1, - binop=_bin_add, - dim_scatter_op=flow.dim_scatter_add, - inplace=True, -): - arg_dict = 
OrderedDict() - arg_dict["grad_flag"] = [grad_flag] - arg_dict["device_type"] = [device_type] - arg_dict["samples"] = [] - arg_dict["samples"].append( - gen_scatter_like_test_sample( - (2, 3), - (2, 3), - 1, - (2, 3), - is_float=value_type == "float", - binop=binop, - inplace=inplace, - ) - ) - if value_type == "float": - if device_type == "cpu": - arg_dict["value_type"] = [ - (np.float32, flow.float32), - ] - else: - arg_dict["value_type"] = [ - (np.float32, flow.float32), - ] - elif value_type == "int": - arg_dict["value_type"] = [(np.float32, flow.int32)] - else: - raise "float or int for value type only" - - arg_dict["index_type"] = [(np.int32, flow.int32)] - arg_dict["machine_ids"] = [machine_ids] - arg_dict["device_count"] = [device_count] - arg_dict["flow_scatter_op"] = [dim_scatter_op] - return arg_dict - - -def _make_dim_scatter_add_like_fn( - test_case, - grad_flag, - src, - index, - dim, - like, - grad, - device_type, - value_type, - index_type, - machine_ids, - device_counts, - flow_scatter_op, -): - flow.clear_default_session() - if device_type == "cpu": - flow.config.cpu_device_num(device_counts) - else: - flow.config.gpu_device_num(device_counts) - - func_config = flow.FunctionConfig() - - # global function needs float32 as type of argument and return value - if value_type == flow.float16: - func_config.default_data_type(flow.float32) - else: - func_config.default_data_type(value_type) - - func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids)) - func_config.default_logical_view(flow.scope.consistent_view()) - - def _compare_diff(blob: tp.Numpy): - test_case.assertTrue(np.allclose(grad, blob)) - - if grad_flag: - if value_type == flow.float32 or value_type == flow.float64: - - @flow.global_function(type="train", function_config=func_config) - def scatter_add_like_fn( - like_def: tp.Numpy.Placeholder(like.shape, dtype=value_type), - indices_def: tp.Numpy.Placeholder(index.shape, dtype=index_type), - src_def: tp.Numpy.Placeholder(src.shape, dtype=value_type), - ) -> tp.Numpy: - with flow.scope.placement(device_type, "0:0"): - src_var = flow.get_variable( - "src", - shape=src.shape, - dtype=value_type, - initializer=flow.constant_initializer(0), - ) - src_var = flow.cast_to_current_logical_view(src_var) - src_tensor = src_var + src_def - - y = flow_scatter_op(like_def, dim, indices_def, src_tensor) - - with flow.scope.placement(device_type, "0:0"): - flow.optimizer.SGD( - flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), - momentum=0, - ).minimize(y) - - flow.watch_diff(src_var, _compare_diff) - return y - - if value_type == flow.int32: - - @flow.global_function(type="train", function_config=func_config) - def scatter_add_like_fn( - like_def: tp.Numpy.Placeholder(like.shape, dtype=flow.float32), - indices_def: tp.Numpy.Placeholder(index.shape, dtype=index_type), - src_def: tp.Numpy.Placeholder(src.shape, dtype=flow.float32), - ) -> tp.Numpy: - with flow.scope.placement(device_type, "0:0"): - src_var = flow.get_variable( - "src", - shape=src_def.shape, - dtype=flow.float32, - initializer=flow.constant_initializer(0), - ) - src_var = flow.cast_to_current_logical_view(src_var) - src_tensor = src_var + src_def - - src_int32 = flow.cast(src_tensor, dtype=flow.int32) - like_def_int32 = flow.cast(like_def, dtype=flow.int32) - y_int32 = flow_scatter_op(like_def_int32, dim, indices_def, src_int32) - y_fp32 = flow.cast(y_int32, dtype=flow.float32) - - with flow.scope.placement(device_type, "0:0"): - flow.optimizer.SGD( - 
flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), - momentum=0, - ).minimize(y_fp32) - - flow.watch_diff(src_int32, _compare_diff) - return y_fp32 - - return scatter_add_like_fn - - else: - if value_type == flow.float32 or value_type == flow.float64: - - @flow.global_function(type="predict", function_config=func_config) - def scatter_add_like_fn( - like_def: tp.Numpy.Placeholder(like.shape, dtype=value_type), - indices_def: tp.Numpy.Placeholder(index.shape, dtype=index_type), - src_def: tp.Numpy.Placeholder(src.shape, dtype=value_type), - ) -> tp.Numpy: - with flow.scope.placement(device_type, "0:0"): - src_var = flow.get_variable( - "src", - shape=src.shape, - dtype=value_type, - initializer=flow.constant_initializer(0), - ) - src_var = flow.cast_to_current_logical_view(src_var) - src_tensor = src_var + src_def - - y = flow_scatter_op(like_def, dim, indices_def, src_tensor) - return y - - if value_type == flow.int32: - - @flow.global_function(type="predict", function_config=func_config) - def scatter_add_like_fn( - like_def: tp.Numpy.Placeholder(like.shape, dtype=flow.float32), - indices_def: tp.Numpy.Placeholder(index.shape, dtype=index_type), - src_def: tp.Numpy.Placeholder(src.shape, dtype=flow.float32), - ) -> tp.Numpy: - with flow.scope.placement(device_type, "0:0"): - src_var = flow.get_variable( - "src", - shape=src_def.shape, - dtype=flow.float32, - initializer=flow.constant_initializer(0), - ) - src_var = flow.cast_to_current_logical_view(src_var) - src_tensor = src_var + src_def - - src_int32 = flow.cast(src_tensor, dtype=flow.int32) - like_def_int32 = flow.cast(like_def, dtype=flow.int32) - y_int32 = flow_scatter_op(like_def_int32, dim, indices_def, src_int32) - y_fp32 = flow.cast(y_int32, dtype=flow.int32) - - return y_fp32 - - return scatter_add_like_fn - - -def _compare_dim_scatter_op_like_with_samples( - test_case, - grad_flag, - device_type, - sample, - value_type, - index_type, - machine_ids, - device_count, - flow_scatter_op, -): - scatter_add_like_fn = _make_dim_scatter_add_like_fn( - test_case, - grad_flag, - sample["like"].astype(value_type[0]), - sample["index"].astype(index_type[0]), - sample["dim"], - sample["src"].astype(value_type[0]), - sample["grad"].astype(value_type[0]), - device_type, - value_type[1], - index_type[1], - machine_ids, - device_count, - flow_scatter_op, - ) - y = scatter_add_like_fn( - sample["like"].astype(value_type[0]), - sample["index"].astype(index_type[0]), - sample["src"].astype(value_type[0]), - ) - y = y.astype(value_type[0]) - if value_type[1] == flow.float16: - test_case.assertTrue( - np.allclose(y, sample["output"].astype(np.float32), 1e-3, 1e-3) - ) - else: - test_case.assertTrue(np.allclose(y, sample["output"].astype(value_type[0]))) - - -@flow.unittest.skip_unless_1n1d() -class TestDimScatterOpsInplace1n1d(flow.unittest.TestCase): - def test_dim_scatter_add_int_cpu(test_case): - arg_dict = _gen_arg_dict( - True, "cpu", "int", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - def test_dim_scatter_add_float_cpu(test_case): - arg_dict = _gen_arg_dict( - True, "cpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_int_gpu(test_case): - arg_dict = _gen_arg_dict( - True, "gpu", "int", "0:0", 1, _bin_add, 
flow.dim_scatter_add, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_float_gpu(test_case): - arg_dict = _gen_arg_dict( - True, "gpu", "float", "0:0", 1, _bin_add, flow.dim_scatter_add, inplace=True - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_float_gpu(test_case): - arg_dict = _gen_arg_dict( - True, - "gpu", - "float", - "0:0", - 1, - _bin_update, - flow.dim_scatter_update, - inplace=True, - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - def test_dim_scatter_update_int_cpu(test_case): - arg_dict = _gen_arg_dict( - True, - "cpu", - "int", - "0:0", - 1, - _bin_update, - flow.dim_scatter_update, - inplace=True, - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - def test_dim_scatter_update_float_cpu(test_case): - arg_dict = _gen_arg_dict( - True, - "cpu", - "float", - "0:0", - 1, - _bin_update, - flow.dim_scatter_update, - inplace=True, - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_update_int_gpu(test_case): - arg_dict = _gen_arg_dict( - True, - "gpu", - "int", - "0:0", - 1, - _bin_update, - flow.dim_scatter_update, - inplace=True, - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_update_float_gpu(test_case): - arg_dict = _gen_arg_dict( - True, - "gpu", - "float", - "0:0", - 1, - _bin_update, - flow.dim_scatter_update, - inplace=True, - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - -@flow.unittest.skip_unless_1n2d() -class TestDimScatterOpsInplace1n2d(flow.unittest.TestCase): - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_add_float(test_case): - arg_dict = _gen_arg_dict( - True, - "gpu", - "float", - "0:0-1", - 2, - _bin_add, - flow.dim_scatter_add, - inplace=True, - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_dim_scatter_update_float(test_case): - arg_dict = _gen_arg_dict( - True, - "gpu", - "float", - "0:0-1", - 2, - _bin_update, - flow.dim_scatter_update, - inplace=True, - ) - for arg in GenArgList(arg_dict): - _compare_dim_scatter_op_like_with_samples(test_case, *arg) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/nn/modules/scatter.py b/python/oneflow/nn/modules/scatter.py index 96d7a5697af..5b44f243d6b 100644 --- a/python/oneflow/nn/modules/scatter.py +++ b/python/oneflow/nn/modules/scatter.py @@ -13,49 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -import oneflow as flow - -from oneflow.python.framework.tensor import Tensor -from oneflow.python.oneflow_export import oneflow_export, experimental_api -from oneflow.python.framework.tensor import register_tensor_op -from oneflow.python.nn.module import Module from typing import Optional, List, Tuple +import oneflow as flow +from oneflow.framework.tensor import Tensor, register_tensor_op +from oneflow.nn.module import Module -class Scatter(Module): - def __init__(self) -> None: - super().__init__() - def forward(self, input, dim, index, src, reduce): - assert type(src) in [ - flow.Tensor, - float, - ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) +__all__ = ["scatter", "scatter_add", "scatter_mul"] + - assert reduce in [ - "add", - "multiply", - None, - ], "reduce must be 'add', 'multiply' or None" - - if isinstance(src, flow.Tensor): - if reduce == "add": - return flow.F.dim_scatter_add(input, index, src, dim) - elif reduce == "multiply": - return flow.F.dim_scatter_mul(input, index, src, dim) - return flow.F.dim_scatter(input, index, src, dim) - elif isinstance(src, float): - if reduce == "add": - return flow.F.dim_scatter_add_scalar(input, index, src, dim) - elif reduce == "multiply": - return flow.F.dim_scatter_mul_scalar(input, index, src, dim) - return flow.F.dim_scatter_scalar(input, index, src, dim) - - -@oneflow_export("scatter") -@experimental_api -def scatter_op(input, dim, index, src, reduce: Optional[str] = None): +def scatter(input, dim, index, src): r"""This operator writes the elements specified by `index` along with the axis `dim` from the `src` into the `input`. @@ -77,7 +46,6 @@ def scatter_op(input, dim, index, src, reduce: Optional[str] = None): dim (int): The axis along which to index index (Tensor): The index blob of elements to scatter. src (Tensor or float): The source blob whose elements will be scatterd and updated to output. - reduce (string): reduction operation to apply, can be either 'add' or 'multiply'. Returns: Tensor: The scatterd Tensor. @@ -86,7 +54,7 @@ def scatter_op(input, dim, index, src, reduce: Optional[str] = None): .. code-block:: python - >>> import oneflow.experimental as flow + >>> import oneflow as flow >>> import numpy as np >>> input = flow.ones((3,5))*2 @@ -97,46 +65,125 @@ def scatter_op(input, dim, index, src, reduce: Optional[str] = None): tensor([[ 0., 10., 20., 2., 2.], [50., 60., 2., 2., 70.], [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) - >>> out = flow.scatter(input, 1, index, src, reduce="add") + + """ + if isinstance(src, flow.Tensor): + assert type(src) in [ + flow.Tensor, + float, + ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) + + if isinstance(src, flow.Tensor): + return flow.F.dim_scatter(input, index, src, dim) + elif isinstance(src, float): + return flow.F.dim_scatter_scalar(input, index, src, dim) + + +def scatter_add(input, dim, index, src): + r"""This operator scatter the src with addition operation according to index along dim into the input. + + Take a 3-D blob as example, the output is specified by: + + .. code-block:: python + + input[index[i][j][k]][j][k] += src[i][j][k] # if dim == 0 + input[i][index[i][j][k]][k] += src[i][j][k] # if dim == 1 + input[i][j][index[i][j][k]] += src[i][j][k] # if dim == 2 + + Args: + input (Tensor): The input blob. + dim (int): The axis along which to index + index (Tensor): The index blob of elements to scatter. + src (Tensor or float): The source blob whose elements will be scatterd and added to output. 
+ + Returns: + Tensor: The scatterd Tensor. + + For example: + + .. code-block:: python + + >>> import oneflow as flow + >>> import numpy as np + >>> input = flow.ones((3,5))*2 + >>> index = flow.tensor(np.array([[0,1,2],[0,1,4]], ), dtype=flow.int32) + >>> src = flow.Tensor(np.array([[0,10,20,30,40],[50,60,70,80,90]])) + >>> out = flow.scatter_add(input, 1, index, src) >>> out tensor([[ 2., 12., 22., 2., 2.], [52., 62., 2., 2., 72.], [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) - >>> out = flow.scatter(input, 1, index, src, reduce="multiply") - >>> out - tensor([[ 0., 20., 40., 2., 2.], - [100., 120., 2., 2., 140.], - [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) - >>> out = flow.scatter(input, 1, index, 3.14) - >>> out - tensor([[3.14, 3.14, 3.14, 2. , 2. ], - [3.14, 3.14, 2. , 2. , 3.14], - [2. , 2. , 2. , 2. , 2. ]], dtype=oneflow.float32) - >>> out = flow.scatter(input, 1, index, 3.14, reduce="add") + >>> out = flow.scatter_add(input, 1, index, 3.14) >>> out tensor([[5.14, 5.14, 5.14, 2. , 2. ], [5.14, 5.14, 2. , 2. , 5.14], [2. , 2. , 2. , 2. , 2. ]], dtype=oneflow.float32) - >>> out = flow.scatter(input, 1, index, 3.14, reduce="multiply") - >>> out - tensor([[6.28, 6.28, 6.28, 2. , 2. ], - [6.28, 6.28, 2. , 2. , 6.28], - [2. , 2. , 2. , 2. , 2. ]], dtype=oneflow.float32) """ + if isinstance(src, flow.Tensor): + assert type(src) in [ + flow.Tensor, + float, + ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) + + if isinstance(src, flow.Tensor): + return flow.F.dim_scatter_add(input, index, src, dim) + elif isinstance(src, float): + return flow.F.dim_scatter_add_scalar(input, index, src, dim) + + +def scatter_mul(input, dim, index, src): + r"""This operator scatter the src with multiplying operation according to index along dim into the input. + + Take a 3-D blob as example, the output is specified by: + + .. code-block:: python + + input[index[i][j][k]][j][k] *= src[i][j][k] # if dim == 0 + input[i][index[i][j][k]][k] *= src[i][j][k] # if dim == 1 + input[i][j][index[i][j][k]] *= src[i][j][k] # if dim == 2 + + Args: + input (Tensor): The input blob. + dim (int): The axis along which to index + index (Tensor): The index blob of elements to scatter. + src (Tensor or float): The source blob whose elements will be scatterd and multiplied to output. - return Scatter()(input, dim, index, src, reduce) + Returns: + Tensor: The scatterd Tensor. + + For example: + + .. code-block:: python + >>> import oneflow as flow + >>> import numpy as np + >>> input = flow.ones((3,5))*2 + >>> index = flow.tensor(np.array([[0,1,2],[0,1,4]], ), dtype=flow.int32) + >>> src = flow.Tensor(np.array([[0,10,20,30,40],[50,60,70,80,90]])) + >>> out = flow.scatter_mul(input, 1, index, src) + >>> out + tensor([[ 0., 20., 40., 2., 2.], + [100., 120., 2., 2., 140.], + [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) + >>> out = flow.scatter_mul(input, 1, index, 3.14) + >>> out + tensor([[6.28, 6.28, 6.28, 2. , 2. ], + [6.28, 6.28, 2. , 2. , 6.28], + [2. , 2. , 2. , 2. , 2. 
]], dtype=oneflow.float32) -@register_tensor_op -@experimental_api -def scatter_tensor_op(input, dim, index, src, reduce: Optional[str] = None): - r""" - In-place version of :func:`oneflow.experimental.scatter` """ + if isinstance(src, flow.Tensor): + assert type(src) in [ + flow.Tensor, + float, + ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) - return Scatter()(input, dim, index, src, reduce) + if isinstance(src, flow.Tensor): + return flow.F.dim_scatter_mul(input, index, src, dim) + elif isinstance(src, float): + return flow.F.dim_scatter_mul_scalar(input, index, src, dim) if __name__ == "__main__": From 7bfd84fe61bb5164419bf8cd0d79de36ef61665f Mon Sep 17 00:00:00 2001 From: YaoChi Date: Tue, 27 Jul 2021 22:55:58 +0800 Subject: [PATCH 72/82] startup of scatter backward op --- .../autograd/gradient_funcs/dim_scatter.cpp | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 oneflow/core/autograd/gradient_funcs/dim_scatter.cpp diff --git a/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp b/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp new file mode 100644 index 00000000000..0b1f985df38 --- /dev/null +++ b/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp @@ -0,0 +1,64 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/op_expr_grad_function.h" +#include "oneflow/core/framework/op_builder.h" +#include "oneflow/core/framework/op_expr.h" +#include "oneflow/core/framework/op_expr_helper.h" +#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" + +namespace oneflow { +namespace one { + +struct DimScatterInterpState : public OpExprInterpState { + int32_t dim; + bool requires_input; + bool requires_src; +}; + +class DimScatter : public OpExprGradFunction { + public: + Maybe Init(const OpExpr& op) override; + Maybe Capture(DimScatterInterpState* ctx, const TensorTuple& inputs, + const TensorTuple& outputs, const AttrMap& attrs) const override; + Maybe Apply(const DimScatterInterpState* ctx, const TensorTuple& out_grads, + TensorTuple* in_grads) const override; + + private: + AttrMap base_attrs_; + std::shared_ptr dim_gather_op_; + std::shared_ptr dim_scatter_scalar_op_; +}; + +Maybe DimScatter::Init(const OpExpr& op) { + + return Maybe::Ok(); +} + +Maybe DimScatter::Capture(DimScatterInterpState* ctx, const TensorTuple& inputs, + const TensorTuple& outputs, const AttrMap& attrs) const { + + return Maybe::Ok(); +} + +Maybe DimScatter::Apply(const DimScatterInterpState* ctx, const TensorTuple& out_grads, + TensorTuple* in_grads) const { + return Maybe::Ok(); +} + +REGISTER_OP_EXPR_GRAD_FUNCTION("dim_scatter", DimScatter); + +} // namespace one +} // namespace oneflow From ee5c49ce7356ef4521efafc0dea9ef050e1a09ed Mon Sep 17 00:00:00 2001 From: YaoChi Date: Wed, 28 Jul 2021 12:32:14 +0800 Subject: [PATCH 73/82] add backward for scatter --- .../autograd/gradient_funcs/dim_scatter.cpp | 48 ++++++++++++++++--- oneflow/core/framework/op_expr_helper.cpp | 26 ++++++++++ oneflow/core/framework/op_expr_helper.h | 8 ++++ oneflow/user/ops/dim_scatter_ops.cpp | 2 - 4 files changed, 76 insertions(+), 8 deletions(-) diff --git a/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp b/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp index 0b1f985df38..c02d225750b 100644 --- a/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp +++ b/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp @@ -24,8 +24,8 @@ namespace one { struct DimScatterInterpState : public OpExprInterpState { int32_t dim; - bool requires_input; - bool requires_src; + bool input_requires_grad; + bool src_requires_grad; }; class DimScatter : public OpExprGradFunction { @@ -43,22 +43,58 @@ class DimScatter : public OpExprGradFunction { }; Maybe DimScatter::Init(const OpExpr& op) { - + const UserOpExpr* fw_op_expr = dynamic_cast(&op); + CHECK_NOTNULL_OR_RETURN(fw_op_expr); + base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); + const std::string& op_name = fw_op_expr->op_name(); + dim_gather_op_ = JUST(op_expr_helper::DimGatherOp(0, GradientOpName(op_name) + "0")); + dim_scatter_scalar_op_ = + JUST(op_expr_helper::DimScatterUpdateScalarOp(0, 0.0f, GradientOpName(op_name) + "1")); return Maybe::Ok(); } Maybe DimScatter::Capture(DimScatterInterpState* ctx, const TensorTuple& inputs, - const TensorTuple& outputs, const AttrMap& attrs) const { + const TensorTuple& outputs, const AttrMap& attrs) const { + CHECK_EQ_OR_RETURN(inputs.size(), 3); + CHECK_EQ_OR_RETURN(outputs.size(), 1); + + ctx->input_requires_grad = inputs.at(0)->requires_grad(); + ctx->src_requires_grad = inputs.at(2)->requires_grad(); + if ((!ctx->input_requires_grad) && (!ctx->src_requires_grad)) { return Maybe::Ok(); } + + ctx->SaveTensorForBackward(inputs.at(1)); // index saved + ComposedAttrMap composed_attrs(attrs, base_attrs_); + 
ctx->dim = JUST(composed_attrs.GetAttr("dim")); return Maybe::Ok(); } Maybe DimScatter::Apply(const DimScatterInterpState* ctx, const TensorTuple& out_grads, - TensorTuple* in_grads) const { + TensorTuple* in_grads) const { + if ((!ctx->input_requires_grad) && (!ctx->src_requires_grad)) { return Maybe::Ok(); } + CHECK_EQ_OR_RETURN(out_grads.size(), 1); + const std::shared_ptr& index = ctx->SavedTensors().at(0); + + in_grads->resize(3); + + if (ctx->input_requires_grad) { + MutableAttrMap attrs; + JUST(attrs.SetAttr("dim", ctx->dim)); + JUST(attrs.SetAttr("src_scalar", 0.0f)); + in_grads->at(0) = JUST( + OpInterpUtil::Dispatch(*dim_scatter_scalar_op_, {out_grads.at(0), index}, attrs)); + } + + if (ctx->src_requires_grad) { + MutableAttrMap attrs; + JUST(attrs.SetAttr("dim", ctx->dim)); + in_grads->at(2) = + JUST(OpInterpUtil::Dispatch(*dim_gather_op_, {out_grads.at(0), index}, attrs)); + } return Maybe::Ok(); } -REGISTER_OP_EXPR_GRAD_FUNCTION("dim_scatter", DimScatter); +REGISTER_OP_EXPR_GRAD_FUNCTION("dim_scatter_update", DimScatter); } // namespace one } // namespace oneflow diff --git a/oneflow/core/framework/op_expr_helper.cpp b/oneflow/core/framework/op_expr_helper.cpp index 77828308749..9aa05d95558 100644 --- a/oneflow/core/framework/op_expr_helper.cpp +++ b/oneflow/core/framework/op_expr_helper.cpp @@ -654,6 +654,32 @@ Maybe DimScatterAddLikeOp(const int32_t dim, const std::string& .Build(); } +Maybe DimScatterUpdateScalarOp(const int32_t dim, const float value) { + return DimScatterAddLikeOp(dim, UniqueOpName("dim_scatter_update_scalar")); +} +Maybe DimScatterUpdateScalarOp(const int32_t dim, const float value, + const std::string& name) { + return one::OpBuilder("dim_scatter_update_scalar", name) + .Input("input") + .Input("index") + .Output("output") + .Attr("dim", dim) + .Attr("src_scalar", value) + .Build(); +} + +Maybe DimGatherOp(const int32_t dim) { + return DimGatherOp(dim, UniqueOpName("dim_gather")); +} +Maybe DimGatherOp(const int32_t dim, const std::string& name) { + return one::OpBuilder("dim_gather", name) + .Input("input") + .Input("index") + .Output("output") + .Attr("dim", dim) + .Build(); +} + Maybe TransposeOp(const std::vector& perm) { return TransposeOp(perm, UniqueOpName("transpose")); } diff --git a/oneflow/core/framework/op_expr_helper.h b/oneflow/core/framework/op_expr_helper.h index a25c9b66a53..23d2f33ed0e 100644 --- a/oneflow/core/framework/op_expr_helper.h +++ b/oneflow/core/framework/op_expr_helper.h @@ -281,5 +281,13 @@ Maybe UnsortedSegmentSumLikeOp(const int64_t& axis, const std:: Maybe SoftmaxGradOp(); Maybe SoftmaxGradOp(const std::string& name); + +Maybe DimScatterUpdateScalarOp(const int32_t dim, const float value); +Maybe DimScatterUpdateScalarOp(const int32_t dim, const float value, + const std::string& name); + +Maybe DimGatherOp(const int32_t dim); +Maybe DimGatherOp(const int32_t dim, const std::string& name); + } // namespace op_expr_helper } // namespace oneflow diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 40eba48ea8e..f83fffd8a92 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -291,10 +291,8 @@ REGISTER_SCATTER_SCALAR_OP("dim_scatter_mul_scalar"); REGISTER_SCATTER_GRAD("dim_scatter_add"); REGISTER_SCATTER_GRAD("dim_scatter_update"); -REGISTER_SCATTER_GRAD("dim_scatter_mul"); REGISTER_SCATTER_SCALAR_GRAD("dim_scatter_update_scalar"); REGISTER_SCATTER_SCALAR_GRAD("dim_scatter_add_scalar"); 
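
The Apply implementation above encodes a simple backward rule for dim_scatter_update: input positions that were overwritten in the forward pass receive zero gradient (hence the dim_scatter_update_scalar dispatch with src_scalar = 0.0 applied to the output gradient), while the src gradient is a dim_gather of the output gradient at the same indices. A NumPy sketch of that rule for the 2-D, dim=1 case with unique indices is below; scatter_update_backward is an illustrative name, not the OneFlow API.

import numpy as np

# NumPy sketch of the dim_scatter_update backward rule (2-D, dim=1, unique indices):
#   d_input = d_out with zeros at the scattered positions
#             (what the dim_scatter_update_scalar(0.0) dispatch produces)
#   d_src   = d_out gathered at `index` along dim
#             (what the dim_gather dispatch produces)
def scatter_update_backward(d_out, dim, index):
    assert dim == 1 and d_out.ndim == 2
    d_input = d_out.copy()
    d_src = np.empty(index.shape, dtype=d_out.dtype)
    for i in range(index.shape[0]):
        for j in range(index.shape[1]):
            d_src[i, j] = d_out[i, index[i, j]]
            d_input[i, index[i, j]] = 0.0
    return d_input, d_src
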
-REGISTER_SCATTER_SCALAR_GRAD("dim_scatter_mul_scalar"); } // namespace user_op } // namespace oneflow From 15fe104c48e2421b780fa07fabd5d5284b831370 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Wed, 28 Jul 2021 18:27:45 +0800 Subject: [PATCH 74/82] scatter ops backward finished --- .../autograd/gradient_funcs/dim_scatter.cpp | 118 +++++++++++++++--- oneflow/user/ops/dim_scatter_ops.cpp | 1 - python/oneflow/nn/modules/scatter.py | 84 ++----------- 3 files changed, 114 insertions(+), 89 deletions(-) diff --git a/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp b/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp index c02d225750b..26b64118972 100644 --- a/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp +++ b/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp @@ -28,6 +28,9 @@ struct DimScatterInterpState : public OpExprInterpState { bool src_requires_grad; }; +enum SCATTER_TYPE { SCATTER_UPDATE, SCATTER_ADD }; + +template class DimScatter : public OpExprGradFunction { public: Maybe Init(const OpExpr& op) override; @@ -35,6 +38,8 @@ class DimScatter : public OpExprGradFunction { const TensorTuple& outputs, const AttrMap& attrs) const override; Maybe Apply(const DimScatterInterpState* ctx, const TensorTuple& out_grads, TensorTuple* in_grads) const override; + Maybe ApplyCommon(const DimScatterInterpState* ctx, const TensorTuple& out_grads, + TensorTuple* in_grads) const; private: AttrMap base_attrs_; @@ -42,7 +47,8 @@ class DimScatter : public OpExprGradFunction { std::shared_ptr dim_scatter_scalar_op_; }; -Maybe DimScatter::Init(const OpExpr& op) { +template +Maybe DimScatter::Init(const OpExpr& op) { const UserOpExpr* fw_op_expr = dynamic_cast(&op); CHECK_NOTNULL_OR_RETURN(fw_op_expr); base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); @@ -53,8 +59,9 @@ Maybe DimScatter::Init(const OpExpr& op) { return Maybe::Ok(); } -Maybe DimScatter::Capture(DimScatterInterpState* ctx, const TensorTuple& inputs, - const TensorTuple& outputs, const AttrMap& attrs) const { +template +Maybe DimScatter::Capture(DimScatterInterpState* ctx, const TensorTuple& inputs, + const TensorTuple& outputs, const AttrMap& attrs) const { CHECK_EQ_OR_RETURN(inputs.size(), 3); CHECK_EQ_OR_RETURN(outputs.size(), 1); @@ -69,32 +76,115 @@ Maybe DimScatter::Capture(DimScatterInterpState* ctx, const TensorTuple& i return Maybe::Ok(); } -Maybe DimScatter::Apply(const DimScatterInterpState* ctx, const TensorTuple& out_grads, - TensorTuple* in_grads) const { - if ((!ctx->input_requires_grad) && (!ctx->src_requires_grad)) { return Maybe::Ok(); } - CHECK_EQ_OR_RETURN(out_grads.size(), 1); +template +Maybe DimScatter::ApplyCommon(const DimScatterInterpState* ctx, + const TensorTuple& out_grads, TensorTuple* in_grads) const { const std::shared_ptr& index = ctx->SavedTensors().at(0); in_grads->resize(3); + if (ctx->src_requires_grad) { + MutableAttrMap attrs; + JUST(attrs.SetAttr("dim", ctx->dim)); + in_grads->at(2) = + JUST(OpInterpUtil::Dispatch(*dim_gather_op_, {out_grads.at(0), index}, attrs)); + } + return Maybe::Ok(); +} + +template<> +Maybe DimScatter::Apply(const DimScatterInterpState* ctx, + const TensorTuple& out_grads, + TensorTuple* in_grads) const { + if ((!ctx->input_requires_grad) && (!ctx->src_requires_grad)) { return Maybe::Ok(); } + CHECK_EQ_OR_RETURN(out_grads.size(), 1); + JUST(ApplyCommon(ctx, out_grads, in_grads)); + if (ctx->input_requires_grad) { + const std::shared_ptr& index = ctx->SavedTensors().at(0); MutableAttrMap attrs; JUST(attrs.SetAttr("dim", ctx->dim)); 
JUST(attrs.SetAttr("src_scalar", 0.0f)); in_grads->at(0) = JUST( OpInterpUtil::Dispatch(*dim_scatter_scalar_op_, {out_grads.at(0), index}, attrs)); } + return Maybe::Ok(); +} + +template<> +Maybe DimScatter::Apply(const DimScatterInterpState* ctx, + const TensorTuple& out_grads, + TensorTuple* in_grads) const { + if ((!ctx->input_requires_grad) && (!ctx->src_requires_grad)) { return Maybe::Ok(); } + CHECK_EQ_OR_RETURN(out_grads.size(), 1); + + JUST(ApplyCommon(ctx, out_grads, in_grads)); + + if (ctx->input_requires_grad) { in_grads->at(0) = out_grads.at(0); } + + return Maybe::Ok(); +} + +class DimScatterUpdateScalar : public OpExprGradFunction { + public: + Maybe Init(const OpExpr& op) override; + Maybe Capture(DimScatterInterpState* ctx, const TensorTuple& inputs, + const TensorTuple& outputs, const AttrMap& attrs) const override; + Maybe Apply(const DimScatterInterpState* ctx, const TensorTuple& out_grads, + TensorTuple* in_grads) const override; + + private: + AttrMap base_attrs_; + std::shared_ptr dim_scatter_scalar_op_; +}; + +Maybe DimScatterUpdateScalar::Init(const OpExpr& op) { + const UserOpExpr* fw_op_expr = dynamic_cast(&op); + CHECK_NOTNULL_OR_RETURN(fw_op_expr); + base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); + const std::string& op_name = fw_op_expr->op_name(); + dim_scatter_scalar_op_ = + JUST(op_expr_helper::DimScatterUpdateScalarOp(0, 0.0f, GradientOpName(op_name))); + return Maybe::Ok(); +} + +Maybe DimScatterUpdateScalar::Capture(DimScatterInterpState* ctx, const TensorTuple& inputs, + const TensorTuple& outputs, + const AttrMap& attrs) const { + CHECK_EQ_OR_RETURN(inputs.size(), 2); + CHECK_EQ_OR_RETURN(outputs.size(), 1); + + ctx->input_requires_grad = inputs.at(0)->requires_grad(); + if (!ctx->input_requires_grad) { return Maybe::Ok(); } + + ctx->SaveTensorForBackward(inputs.at(1)); // index saved + + ComposedAttrMap composed_attrs(attrs, base_attrs_); + ctx->dim = JUST(composed_attrs.GetAttr("dim")); + return Maybe::Ok(); +} + +Maybe DimScatterUpdateScalar::Apply(const DimScatterInterpState* ctx, + const TensorTuple& out_grads, + TensorTuple* in_grads) const { + if (!ctx->input_requires_grad) { return Maybe::Ok(); } + CHECK_EQ_OR_RETURN(out_grads.size(), 1); + const std::shared_ptr& index = ctx->SavedTensors().at(0); + + in_grads->resize(2); + + MutableAttrMap attrs; + JUST(attrs.SetAttr("dim", ctx->dim)); + JUST(attrs.SetAttr("src_scalar", 0.0f)); + in_grads->at(0) = JUST( + OpInterpUtil::Dispatch(*dim_scatter_scalar_op_, {out_grads.at(0), index}, attrs)); - if (ctx->src_requires_grad) { - MutableAttrMap attrs; - JUST(attrs.SetAttr("dim", ctx->dim)); - in_grads->at(2) = - JUST(OpInterpUtil::Dispatch(*dim_gather_op_, {out_grads.at(0), index}, attrs)); - } return Maybe::Ok(); } -REGISTER_OP_EXPR_GRAD_FUNCTION("dim_scatter_update", DimScatter); +REGISTER_OP_EXPR_GRAD_FUNCTION("dim_scatter_update", DimScatter); +REGISTER_OP_EXPR_GRAD_FUNCTION("dim_scatter_add", DimScatter); +REGISTER_OP_EXPR_GRAD_FUNCTION("dim_scatter_update_scalar", DimScatterUpdateScalar); } // namespace one } // namespace oneflow diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index f83fffd8a92..d263d304694 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -293,6 +293,5 @@ REGISTER_SCATTER_GRAD("dim_scatter_add"); REGISTER_SCATTER_GRAD("dim_scatter_update"); REGISTER_SCATTER_SCALAR_GRAD("dim_scatter_update_scalar"); -REGISTER_SCATTER_SCALAR_GRAD("dim_scatter_add_scalar"); } // namespace user_op } 
// namespace oneflow diff --git a/python/oneflow/nn/modules/scatter.py b/python/oneflow/nn/modules/scatter.py index 5b44f243d6b..45f05e61079 100644 --- a/python/oneflow/nn/modules/scatter.py +++ b/python/oneflow/nn/modules/scatter.py @@ -21,7 +21,7 @@ from oneflow.nn.module import Module -__all__ = ["scatter", "scatter_add", "scatter_mul"] +__all__ = ["scatter", "scatter_add"] def scatter(input, dim, index, src): @@ -67,11 +67,10 @@ def scatter(input, dim, index, src): [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) """ - if isinstance(src, flow.Tensor): - assert type(src) in [ - flow.Tensor, - float, - ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) + assert type(src) in [ + flow.Tensor, + float, + ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) if isinstance(src, flow.Tensor): return flow.F.dim_scatter(input, index, src, dim) @@ -94,7 +93,7 @@ def scatter_add(input, dim, index, src): input (Tensor): The input blob. dim (int): The axis along which to index index (Tensor): The index blob of elements to scatter. - src (Tensor or float): The source blob whose elements will be scatterd and added to output. + src (Tensor): The source blob whose elements will be scatterd and added to output. Returns: Tensor: The scatterd Tensor. @@ -113,77 +112,14 @@ def scatter_add(input, dim, index, src): tensor([[ 2., 12., 22., 2., 2.], [52., 62., 2., 2., 72.], [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) - >>> out = flow.scatter_add(input, 1, index, 3.14) - >>> out - tensor([[5.14, 5.14, 5.14, 2. , 2. ], - [5.14, 5.14, 2. , 2. , 5.14], - [2. , 2. , 2. , 2. , 2. ]], dtype=oneflow.float32) """ - if isinstance(src, flow.Tensor): - assert type(src) in [ - flow.Tensor, - float, - ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) - - if isinstance(src, flow.Tensor): - return flow.F.dim_scatter_add(input, index, src, dim) - elif isinstance(src, float): - return flow.F.dim_scatter_add_scalar(input, index, src, dim) - - -def scatter_mul(input, dim, index, src): - r"""This operator scatter the src with multiplying operation according to index along dim into the input. - - Take a 3-D blob as example, the output is specified by: - - .. code-block:: python - - input[index[i][j][k]][j][k] *= src[i][j][k] # if dim == 0 - input[i][index[i][j][k]][k] *= src[i][j][k] # if dim == 1 - input[i][j][index[i][j][k]] *= src[i][j][k] # if dim == 2 - - Args: - input (Tensor): The input blob. - dim (int): The axis along which to index - index (Tensor): The index blob of elements to scatter. - src (Tensor or float): The source blob whose elements will be scatterd and multiplied to output. - - Returns: - Tensor: The scatterd Tensor. - For example: - - .. code-block:: python + assert type(src) in [ + flow.Tensor + ], f"type of src must be oneflow.Tensor, but %s givien" % type(src) - >>> import oneflow as flow - >>> import numpy as np - >>> input = flow.ones((3,5))*2 - >>> index = flow.tensor(np.array([[0,1,2],[0,1,4]], ), dtype=flow.int32) - >>> src = flow.Tensor(np.array([[0,10,20,30,40],[50,60,70,80,90]])) - >>> out = flow.scatter_mul(input, 1, index, src) - >>> out - tensor([[ 0., 20., 40., 2., 2.], - [100., 120., 2., 2., 140.], - [ 2., 2., 2., 2., 2.]], dtype=oneflow.float32) - >>> out = flow.scatter_mul(input, 1, index, 3.14) - >>> out - tensor([[6.28, 6.28, 6.28, 2. , 2. ], - [6.28, 6.28, 2. , 2. , 6.28], - [2. , 2. , 2. , 2. , 2. 
]], dtype=oneflow.float32) - - - """ - if isinstance(src, flow.Tensor): - assert type(src) in [ - flow.Tensor, - float, - ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) - - if isinstance(src, flow.Tensor): - return flow.F.dim_scatter_mul(input, index, src, dim) - elif isinstance(src, float): - return flow.F.dim_scatter_mul_scalar(input, index, src, dim) + return flow.F.dim_scatter_add(input, index, src, dim) if __name__ == "__main__": From beac5127abe83af8ff6623d6d9388e8e545ebe3a Mon Sep 17 00:00:00 2001 From: YaoChi Date: Wed, 28 Jul 2021 19:15:03 +0800 Subject: [PATCH 75/82] add scatter, scatter_add test cases --- python/oneflow/nn/modules/scatter.py | 6 +- .../oneflow/test/modules/test_scatter_ops.py | 64 +++++++++++++++++++ 2 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 python/oneflow/test/modules/test_scatter_ops.py diff --git a/python/oneflow/nn/modules/scatter.py b/python/oneflow/nn/modules/scatter.py index 45f05e61079..e3ce8d9be14 100644 --- a/python/oneflow/nn/modules/scatter.py +++ b/python/oneflow/nn/modules/scatter.py @@ -70,9 +70,10 @@ def scatter(input, dim, index, src): assert type(src) in [ flow.Tensor, float, + flow._oneflow_internal.Tensor, ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) - if isinstance(src, flow.Tensor): + if isinstance(src, flow.Tensor) or isinstance(src, flow._oneflow_internal.Tensor): return flow.F.dim_scatter(input, index, src, dim) elif isinstance(src, float): return flow.F.dim_scatter_scalar(input, index, src, dim) @@ -116,7 +117,8 @@ def scatter_add(input, dim, index, src): """ assert type(src) in [ - flow.Tensor + flow.Tensor, + flow._oneflow_internal.Tensor, ], f"type of src must be oneflow.Tensor, but %s givien" % type(src) return flow.F.dim_scatter_add(input, index, src, dim) diff --git a/python/oneflow/test/modules/test_scatter_ops.py b/python/oneflow/test/modules/test_scatter_ops.py new file mode 100644 index 00000000000..bf65096f25f --- /dev/null +++ b/python/oneflow/test/modules/test_scatter_ops.py @@ -0,0 +1,64 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import unittest +from collections import OrderedDict + +import numpy as np +from test_util import GenArgList + +import oneflow as flow +import oneflow.unittest +from automated_test_util import * + + +@flow.unittest.skip_unless_1n1d() +class TestClampModule(flow.unittest.TestCase): + @autotest() + def test_scatter_random_data(test_case): + device = random_device() + input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( + device + ) + y = torch.scatter(input, 0, index, src) + return y + + @autotest() + def test_scatter_scalar_random_data(test_case): + device = random_device() + input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( + device + ) + y = torch.scatter(input, 0, index, 3.14) + return y + + @autotest() + def test_scatter_add_random_data(test_case): + device = random_device() + input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( + device + ) + y = torch.scatter_add(input, 0, index, src) + return y + + +if __name__ == "__main__": + unittest.main() From e6ed34ab1b595ec5835eb91df82837d6e8c0ac22 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Wed, 28 Jul 2021 21:11:12 +0800 Subject: [PATCH 76/82] remove useless scatter_update_like --- oneflow/user/kernels/dim_scatter_kernels.cpp | 2 - oneflow/user/ops/dim_scatter_ops.cpp | 1 - python/oneflow/nn/modules/scatter.py | 8 ++-- .../oneflow/test/modules/test_scatter_ops.py | 42 +++++++++++++++---- 4 files changed, 38 insertions(+), 15 deletions(-) diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 8f2ad2ca831..2701ee99e19 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -126,14 +126,12 @@ class DimScatterKernel final : public user_op::OpKernel { REGISTER_DIM_SCATTER_LIKE_CPU_KERNELS("dim_scatter_add_like", BinOpAddFunctor); REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_add", BinOpAddFunctor); -REGISTER_DIM_SCATTER_LIKE_CPU_KERNELS("dim_scatter_update_like", BinOpUpdateFunctor); REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_update", BinOpUpdateFunctor); REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_mul", BinOpMulFunctor); #ifdef WITH_CUDA REGISTER_DIM_SCATTER_LIKE_GPU_KERNELS("dim_scatter_add_like", BinOpAddFunctor); REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_add", BinOpAddFunctor); -REGISTER_DIM_SCATTER_LIKE_GPU_KERNELS("dim_scatter_update_like", BinOpUpdateFunctor); REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_update", BinOpUpdateFunctor); REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_mul", BinOpMulFunctor); #endif // WITH_CUDA diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index d263d304694..1a8f71d2439 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -280,7 +280,6 @@ Maybe ScatterBackward(user_op::BackwardOpConfContext* ctx) { }); REGISTER_SCATTER_LIKE_OP("dim_scatter_add_like"); -REGISTER_SCATTER_LIKE_OP("dim_scatter_update_like"); REGISTER_SCATTER_OP("dim_scatter_add"); REGISTER_SCATTER_OP("dim_scatter_update"); REGISTER_SCATTER_OP("dim_scatter_mul"); diff --git 
a/python/oneflow/nn/modules/scatter.py b/python/oneflow/nn/modules/scatter.py index e3ce8d9be14..fef77c9c102 100644 --- a/python/oneflow/nn/modules/scatter.py +++ b/python/oneflow/nn/modules/scatter.py @@ -69,11 +69,10 @@ def scatter(input, dim, index, src): """ assert type(src) in [ flow.Tensor, - float, - flow._oneflow_internal.Tensor, + float ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) - if isinstance(src, flow.Tensor) or isinstance(src, flow._oneflow_internal.Tensor): + if isinstance(src, flow.Tensor): return flow.F.dim_scatter(input, index, src, dim) elif isinstance(src, float): return flow.F.dim_scatter_scalar(input, index, src, dim) @@ -117,8 +116,7 @@ def scatter_add(input, dim, index, src): """ assert type(src) in [ - flow.Tensor, - flow._oneflow_internal.Tensor, + flow.Tensor ], f"type of src must be oneflow.Tensor, but %s givien" % type(src) return flow.F.dim_scatter_add(input, index, src, dim) diff --git a/python/oneflow/test/modules/test_scatter_ops.py b/python/oneflow/test/modules/test_scatter_ops.py index bf65096f25f..0761922e1db 100644 --- a/python/oneflow/test/modules/test_scatter_ops.py +++ b/python/oneflow/test/modules/test_scatter_ops.py @@ -14,10 +14,6 @@ limitations under the License. """ import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest @@ -27,7 +23,7 @@ @flow.unittest.skip_unless_1n1d() class TestClampModule(flow.unittest.TestCase): @autotest() - def test_scatter_random_data(test_case): + def test_scatter_random_data_at_dim_0(test_case): device = random_device() input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) @@ -38,7 +34,18 @@ def test_scatter_random_data(test_case): return y @autotest() - def test_scatter_scalar_random_data(test_case): + def test_scatter_random_data_at_dim_1(test_case): + device = random_device() + input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( + device + ) + y = torch.scatter(input, 1, index, src) + return y + + @autotest() + def test_scatter_scalar_random_data_at_dim0(test_case): device = random_device() input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) @@ -49,7 +56,18 @@ def test_scatter_scalar_random_data(test_case): return y @autotest() - def test_scatter_add_random_data(test_case): + def test_scatter_scalar_random_data_at_dim1(test_case): + device = random_device() + input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( + device + ) + y = torch.scatter(input, 1, index, 3.14) + return y + + @autotest() + def test_scatter_add_random_data_at_dim0(test_case): device = random_device() input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) @@ -59,6 +77,16 @@ def test_scatter_add_random_data(test_case): y = torch.scatter_add(input, 0, index, src) return y + @autotest() + def test_scatter_add_random_data_at_dim1(test_case): + device = random_device() + input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) + src = random_pytorch_tensor(ndim=2, dim0=2, 
dim1=2).to(device) + index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( + device + ) + y = torch.scatter_add(input, 1, index, src) + return y if __name__ == "__main__": unittest.main() From 6236405dafc0ce56c8123f0ffde60e96c3e19a0e Mon Sep 17 00:00:00 2001 From: YaoChi Date: Wed, 28 Jul 2021 21:11:49 +0800 Subject: [PATCH 77/82] reformat --- python/oneflow/nn/modules/scatter.py | 2 +- python/oneflow/test/modules/test_scatter_ops.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/oneflow/nn/modules/scatter.py b/python/oneflow/nn/modules/scatter.py index fef77c9c102..45f05e61079 100644 --- a/python/oneflow/nn/modules/scatter.py +++ b/python/oneflow/nn/modules/scatter.py @@ -69,7 +69,7 @@ def scatter(input, dim, index, src): """ assert type(src) in [ flow.Tensor, - float + float, ], f"type of src must be oneflow.Tensor or float, but %s givien" % type(src) if isinstance(src, flow.Tensor): diff --git a/python/oneflow/test/modules/test_scatter_ops.py b/python/oneflow/test/modules/test_scatter_ops.py index 0761922e1db..bb7041904f4 100644 --- a/python/oneflow/test/modules/test_scatter_ops.py +++ b/python/oneflow/test/modules/test_scatter_ops.py @@ -88,5 +88,6 @@ def test_scatter_add_random_data_at_dim1(test_case): y = torch.scatter_add(input, 1, index, src) return y + if __name__ == "__main__": unittest.main() From 5f1c45d14d13c0787ddf84cce46a2ed8cd511a1f Mon Sep 17 00:00:00 2001 From: YaoChi Date: Wed, 28 Jul 2021 21:18:35 +0800 Subject: [PATCH 78/82] refine test cases --- python/oneflow/nn/modules/scatter.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/oneflow/nn/modules/scatter.py b/python/oneflow/nn/modules/scatter.py index 45f05e61079..7572e9d357a 100644 --- a/python/oneflow/nn/modules/scatter.py +++ b/python/oneflow/nn/modules/scatter.py @@ -14,10 +14,8 @@ limitations under the License. 
""" -from typing import Optional, List, Tuple - import oneflow as flow -from oneflow.framework.tensor import Tensor, register_tensor_op +from oneflow.framework.tensor import Tensor from oneflow.nn.module import Module From bad64a92626f8f930e5730bdfd676375fe81f222 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Thu, 29 Jul 2021 12:18:12 +0800 Subject: [PATCH 79/82] refine according to comments --- docs/source/oneflow.rst | 2 ++ .../autograd/gradient_funcs/dim_scatter.cpp | 28 +++++-------------- oneflow/core/functional/functional_api.yaml | 8 ------ .../user/kernels/dim_scatter_kernel_util.cpp | 9 ------ .../user/kernels/dim_scatter_kernel_util.cu | 1 - .../user/kernels/dim_scatter_kernel_util.h | 5 ---- oneflow/user/kernels/dim_scatter_kernels.cpp | 2 -- .../dim_scatter_scalar_kernel_util.cpp | 9 ------ .../kernels/dim_scatter_scalar_kernel_util.cu | 1 - .../kernels/dim_scatter_scalar_kernel_util.h | 5 ---- .../kernels/dim_scatter_scalar_kernels.cpp | 2 -- .../oneflow/test/modules/test_scatter_ops.py | 2 +- 12 files changed, 10 insertions(+), 64 deletions(-) diff --git a/docs/source/oneflow.rst b/docs/source/oneflow.rst index 4e0b646e600..e2befa2ae1a 100644 --- a/docs/source/oneflow.rst +++ b/docs/source/oneflow.rst @@ -27,6 +27,8 @@ oneflow reshape, save, saved_model, + scatter, + scatter_add, scatter_nd, slice, slice_update, diff --git a/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp b/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp index 26b64118972..6bda00e3abc 100644 --- a/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp +++ b/oneflow/core/autograd/gradient_funcs/dim_scatter.cpp @@ -18,6 +18,7 @@ limitations under the License. #include "oneflow/core/framework/op_expr.h" #include "oneflow/core/framework/op_expr_helper.h" #include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" +#include "oneflow/core/functional/functional.h" namespace oneflow { namespace one { @@ -43,8 +44,6 @@ class DimScatter : public OpExprGradFunction { private: AttrMap base_attrs_; - std::shared_ptr dim_gather_op_; - std::shared_ptr dim_scatter_scalar_op_; }; template @@ -52,10 +51,6 @@ Maybe DimScatter::Init(const OpExpr& op) { const UserOpExpr* fw_op_expr = dynamic_cast(&op); CHECK_NOTNULL_OR_RETURN(fw_op_expr); base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); - const std::string& op_name = fw_op_expr->op_name(); - dim_gather_op_ = JUST(op_expr_helper::DimGatherOp(0, GradientOpName(op_name) + "0")); - dim_scatter_scalar_op_ = - JUST(op_expr_helper::DimScatterUpdateScalarOp(0, 0.0f, GradientOpName(op_name) + "1")); return Maybe::Ok(); } @@ -84,10 +79,7 @@ Maybe DimScatter::ApplyCommon(const DimScatterInterpState* ctx, in_grads->resize(3); if (ctx->src_requires_grad) { - MutableAttrMap attrs; - JUST(attrs.SetAttr("dim", ctx->dim)); - in_grads->at(2) = - JUST(OpInterpUtil::Dispatch(*dim_gather_op_, {out_grads.at(0), index}, attrs)); + in_grads->at(2) = JUST(functional::DimGather(out_grads.at(0), index, ctx->dim)); } return Maybe::Ok(); } @@ -102,11 +94,8 @@ Maybe DimScatter::Apply(const DimScatterInte if (ctx->input_requires_grad) { const std::shared_ptr& index = ctx->SavedTensors().at(0); - MutableAttrMap attrs; - JUST(attrs.SetAttr("dim", ctx->dim)); - JUST(attrs.SetAttr("src_scalar", 0.0f)); - in_grads->at(0) = JUST( - OpInterpUtil::Dispatch(*dim_scatter_scalar_op_, {out_grads.at(0), index}, attrs)); + in_grads->at(0) = + JUST(functional::DimScatterUpdateScalar(out_grads.at(0), index, 0.0f, ctx->dim)); } return Maybe::Ok(); } @@ -135,16 +124,13 @@ class 
DimScatterUpdateScalar : public OpExprGradFunction private: AttrMap base_attrs_; - std::shared_ptr dim_scatter_scalar_op_; }; Maybe DimScatterUpdateScalar::Init(const OpExpr& op) { const UserOpExpr* fw_op_expr = dynamic_cast(&op); CHECK_NOTNULL_OR_RETURN(fw_op_expr); base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); - const std::string& op_name = fw_op_expr->op_name(); - dim_scatter_scalar_op_ = - JUST(op_expr_helper::DimScatterUpdateScalarOp(0, 0.0f, GradientOpName(op_name))); + return Maybe::Ok(); } @@ -176,8 +162,8 @@ Maybe DimScatterUpdateScalar::Apply(const DimScatterInterpState* ctx, MutableAttrMap attrs; JUST(attrs.SetAttr("dim", ctx->dim)); JUST(attrs.SetAttr("src_scalar", 0.0f)); - in_grads->at(0) = JUST( - OpInterpUtil::Dispatch(*dim_scatter_scalar_op_, {out_grads.at(0), index}, attrs)); + in_grads->at(0) = + JUST(functional::DimScatterUpdateScalar(out_grads.at(0), index, 0.0f, ctx->dim);); return Maybe::Ok(); } diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 012020aac86..bb6cba2acd7 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -738,10 +738,6 @@ signature: "Tensor DimScatterAdd(Tensor input, Tensor index, Tensor src, *, Int32 dim)" bind_python: True -- name: "dim_scatter_mul" - signature: "Tensor DimScatterMul(Tensor input, Tensor index, Tensor src, *, Int32 dim)" - bind_python: True - - name: "dim_scatter_scalar" signature: "Tensor DimScatterUpdateScalar(Tensor input, Tensor index, *, Float src, Int32 dim)" bind_python: True @@ -750,10 +746,6 @@ signature: "Tensor DimScatterAddScalar(Tensor input, Tensor index, *, Float src, Int32 dim)" bind_python: True -- name: "dim_scatter_mul_scalar" - signature: "Tensor DimScatterMulScalar(Tensor input, Tensor index, *, Float src, Int32 dim)" - bind_python: True - - name: "tensor_setitem" signature: "Void TensorSetItem(Tensor x, *, TensorIndex index, Tensor value)" bind_python: True diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.cpp index a4d822fb599..796ba608f15 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cpp @@ -33,16 +33,7 @@ struct DimScatterFunctor final { }; INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCPU, BinOpAddFunctor); -template struct DimScatterFunctor; -template struct DimScatterFunctor; - INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCPU, BinOpUpdateFunctor); -template struct DimScatterFunctor; -template struct DimScatterFunctor; - -INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCPU, BinOpMulFunctor); -template struct DimScatterFunctor; -template struct DimScatterFunctor; } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.cu b/oneflow/user/kernels/dim_scatter_kernel_util.cu index 7f6058d9d5a..151a329b300 100644 --- a/oneflow/user/kernels/dim_scatter_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_kernel_util.cu @@ -59,7 +59,6 @@ struct DimScatterFunctor final { INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kGPU, BinOpAddFunctor); INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kGPU, BinOpUpdateFunctor); -INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kGPU, BinOpMulFunctor); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.h b/oneflow/user/kernels/dim_scatter_kernel_util.h index 4b270441a9c..34ff5cd31d3 100644 --- 
a/oneflow/user/kernels/dim_scatter_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_kernel_util.h @@ -59,11 +59,6 @@ struct BinOpUpdateFunctor { OF_DEVICE_FUNC static void apply(const T* x, T* y) { *y = *x; } }; -template -struct BinOpMulFunctor { - OF_DEVICE_FUNC static void apply(const T* x, T* y) { *y *= *x; } -}; - template class Opt> struct DimScatterFunctor final { void operator()(DeviceCtx* ctx, const DimOpIndexNdHelper& src_nd_helper, diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 2701ee99e19..087d0812ba3 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -127,13 +127,11 @@ class DimScatterKernel final : public user_op::OpKernel { REGISTER_DIM_SCATTER_LIKE_CPU_KERNELS("dim_scatter_add_like", BinOpAddFunctor); REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_add", BinOpAddFunctor); REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_update", BinOpUpdateFunctor); -REGISTER_DIM_SCATTER_CPU_KERNELS("dim_scatter_mul", BinOpMulFunctor); #ifdef WITH_CUDA REGISTER_DIM_SCATTER_LIKE_GPU_KERNELS("dim_scatter_add_like", BinOpAddFunctor); REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_add", BinOpAddFunctor); REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_update", BinOpUpdateFunctor); -REGISTER_DIM_SCATTER_GPU_KERNELS("dim_scatter_mul", BinOpMulFunctor); #endif // WITH_CUDA } // namespace user_op diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp index b1e19e727e4..df6c91a0323 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cpp @@ -31,16 +31,7 @@ struct DimScatterScalarFunctor final { }; INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCPU, UpdateScalarFunctor); -template struct DimScatterScalarFunctor; -template struct DimScatterScalarFunctor; - INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCPU, AddScalarFunctor); -template struct DimScatterScalarFunctor; -template struct DimScatterScalarFunctor; - -INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCPU, MulScalarFunctor); -template struct DimScatterScalarFunctor; -template struct DimScatterScalarFunctor; } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu index 16d259e6cac..abe4b3cd301 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.cu @@ -44,7 +44,6 @@ struct DimScatterScalarFunctor final { INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kGPU, UpdateScalarFunctor); INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kGPU, AddScalarFunctor); -INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kGPU, MulScalarFunctor); } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h index a3656eda037..199cdc7901d 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.h @@ -46,11 +46,6 @@ struct UpdateScalarFunctor { OF_DEVICE_FUNC static void apply(const T x, T* y) { *y = x; } }; -template -struct MulScalarFunctor { - OF_DEVICE_FUNC static void apply(const T x, T* y) { *y *= x; } -}; - #define INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(device_type, opt) \ template struct 
DimScatterScalarFunctor; \ template struct DimScatterScalarFunctor; \ diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp index 3396d2220ca..154ab129b56 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp @@ -91,12 +91,10 @@ class DimScatterScalarKernel final : public user_op::OpKernel { REGISTER_SCATTER_SCALAR_CPU_KERNELS("dim_scatter_update_scalar", UpdateScalarFunctor); REGISTER_SCATTER_SCALAR_CPU_KERNELS("dim_scatter_add_scalar", AddScalarFunctor); -REGISTER_SCATTER_SCALAR_CPU_KERNELS("dim_scatter_mul_scalar", MulScalarFunctor); #ifdef WITH_CUDA REGISTER_SCATTER_SCALAR_GPU_KERNELS("dim_scatter_update_scalar", UpdateScalarFunctor); REGISTER_SCATTER_SCALAR_GPU_KERNELS("dim_scatter_add_scalar", AddScalarFunctor); -REGISTER_SCATTER_SCALAR_GPU_KERNELS("dim_scatter_mul_scalar", MulScalarFunctor); #endif // WITH_CUDA } // namespace user_op diff --git a/python/oneflow/test/modules/test_scatter_ops.py b/python/oneflow/test/modules/test_scatter_ops.py index bb7041904f4..33c144d8d32 100644 --- a/python/oneflow/test/modules/test_scatter_ops.py +++ b/python/oneflow/test/modules/test_scatter_ops.py @@ -21,7 +21,7 @@ @flow.unittest.skip_unless_1n1d() -class TestClampModule(flow.unittest.TestCase): +class TestScatterOpsModule(flow.unittest.TestCase): @autotest() def test_scatter_random_data_at_dim_0(test_case): device = random_device() From c875bd1e6e46462ca86dd76f9370bb959debdb71 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Thu, 29 Jul 2021 12:25:33 +0800 Subject: [PATCH 80/82] revert op_exprt_helper --- oneflow/core/framework/op_expr_helper.cpp | 26 ----------------------- oneflow/core/framework/op_expr_helper.h | 8 ------- 2 files changed, 34 deletions(-) diff --git a/oneflow/core/framework/op_expr_helper.cpp b/oneflow/core/framework/op_expr_helper.cpp index 9aa05d95558..77828308749 100644 --- a/oneflow/core/framework/op_expr_helper.cpp +++ b/oneflow/core/framework/op_expr_helper.cpp @@ -654,32 +654,6 @@ Maybe DimScatterAddLikeOp(const int32_t dim, const std::string& .Build(); } -Maybe DimScatterUpdateScalarOp(const int32_t dim, const float value) { - return DimScatterAddLikeOp(dim, UniqueOpName("dim_scatter_update_scalar")); -} -Maybe DimScatterUpdateScalarOp(const int32_t dim, const float value, - const std::string& name) { - return one::OpBuilder("dim_scatter_update_scalar", name) - .Input("input") - .Input("index") - .Output("output") - .Attr("dim", dim) - .Attr("src_scalar", value) - .Build(); -} - -Maybe DimGatherOp(const int32_t dim) { - return DimGatherOp(dim, UniqueOpName("dim_gather")); -} -Maybe DimGatherOp(const int32_t dim, const std::string& name) { - return one::OpBuilder("dim_gather", name) - .Input("input") - .Input("index") - .Output("output") - .Attr("dim", dim) - .Build(); -} - Maybe TransposeOp(const std::vector& perm) { return TransposeOp(perm, UniqueOpName("transpose")); } diff --git a/oneflow/core/framework/op_expr_helper.h b/oneflow/core/framework/op_expr_helper.h index 23d2f33ed0e..a25c9b66a53 100644 --- a/oneflow/core/framework/op_expr_helper.h +++ b/oneflow/core/framework/op_expr_helper.h @@ -281,13 +281,5 @@ Maybe UnsortedSegmentSumLikeOp(const int64_t& axis, const std:: Maybe SoftmaxGradOp(); Maybe SoftmaxGradOp(const std::string& name); - -Maybe DimScatterUpdateScalarOp(const int32_t dim, const float value); -Maybe DimScatterUpdateScalarOp(const int32_t dim, const float value, - const std::string& name); - -Maybe DimGatherOp(const 
int32_t dim); -Maybe DimGatherOp(const int32_t dim, const std::string& name); - } // namespace op_expr_helper } // namespace oneflow From bf787c602673ccff04008535d062e9cf95acd9f5 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Fri, 30 Jul 2021 09:41:26 +0800 Subject: [PATCH 81/82] fixed index element --- .../oneflow/test/modules/test_scatter_ops.py | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/python/oneflow/test/modules/test_scatter_ops.py b/python/oneflow/test/modules/test_scatter_ops.py index 33c144d8d32..aa1c01e4111 100644 --- a/python/oneflow/test/modules/test_scatter_ops.py +++ b/python/oneflow/test/modules/test_scatter_ops.py @@ -17,73 +17,74 @@ import oneflow as flow import oneflow.unittest +import numpy as np from automated_test_util import * @flow.unittest.skip_unless_1n1d() class TestScatterOpsModule(flow.unittest.TestCase): - @autotest() + @autotest(n=5) def test_scatter_random_data_at_dim_0(test_case): device = random_device() input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) - index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( - device + index = constant( + torch.tensor(np.array([[0, 1], [1, 0]]), dtype=torch.int64, device=device) ) y = torch.scatter(input, 0, index, src) return y - @autotest() + @autotest(n=5) def test_scatter_random_data_at_dim_1(test_case): device = random_device() input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) - index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( - device + index = constant( + torch.tensor(np.array([[1, 0], [0, 1]]), dtype=torch.int64, device=device) ) y = torch.scatter(input, 1, index, src) return y - @autotest() + @autotest(n=5) def test_scatter_scalar_random_data_at_dim0(test_case): device = random_device() input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) - index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( - device + index = constant( + torch.tensor(np.array([[0, 1], [1, 0]]), dtype=torch.int64, device=device) ) y = torch.scatter(input, 0, index, 3.14) return y - @autotest() + @autotest(n=5) def test_scatter_scalar_random_data_at_dim1(test_case): device = random_device() input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) - index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( - device + index = constant( + torch.tensor(np.array([[1, 0], [0, 1]]), dtype=torch.int64, device=device) ) y = torch.scatter(input, 1, index, 3.14) return y - @autotest() + @autotest(n=5) def test_scatter_add_random_data_at_dim0(test_case): device = random_device() input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) - index = random_pytorch_tensor(ndim=2, dim0=2, dim1=2, high=2, dtype=int).to( - device + index = constant( + torch.tensor(np.array([[1, 0], [0, 1]]), dtype=torch.int64, device=device) ) y = torch.scatter_add(input, 0, index, src) return y - @autotest() + @autotest(n=5) def test_scatter_add_random_data_at_dim1(test_case): device = random_device() input = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) src = random_pytorch_tensor(ndim=2, dim0=2, dim1=2).to(device) - index = random_pytorch_tensor(ndim=2, 
dim0=2, dim1=2, high=2, dtype=int).to( - device + index = constant( + torch.tensor(np.array([[0, 1], [1, 0]]), dtype=torch.int64, device=device) ) y = torch.scatter_add(input, 1, index, src) return y From fa051ae014114c9c014aca11f5636cf717b7de84 Mon Sep 17 00:00:00 2001 From: YaoChi Date: Sat, 31 Jul 2021 12:51:42 +0800 Subject: [PATCH 82/82] fix scatter update like expr for dim gather backward --- oneflow/core/autograd/gradient_funcs/dim_gather.cpp | 2 +- oneflow/core/framework/op_expr_helper.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/oneflow/core/autograd/gradient_funcs/dim_gather.cpp b/oneflow/core/autograd/gradient_funcs/dim_gather.cpp index 09a2d475de7..4ae5b63e960 100644 --- a/oneflow/core/autograd/gradient_funcs/dim_gather.cpp +++ b/oneflow/core/autograd/gradient_funcs/dim_gather.cpp @@ -72,7 +72,7 @@ Maybe DimGather::Apply(const DimGatherInterpState* ctx, const TensorTuple& MutableAttrMap attrs; JUST(attrs.SetAttr("dim", ctx->dim)); in_grads->at(0) = JUST( - OpInterpUtil::Dispatch(*bw_dim_gather_op_, {like, out_grads.at(0), index}, attrs)); + OpInterpUtil::Dispatch(*bw_dim_gather_op_, {like, index, out_grads.at(0)}, attrs)); return Maybe::Ok(); } diff --git a/oneflow/core/framework/op_expr_helper.cpp b/oneflow/core/framework/op_expr_helper.cpp index 77828308749..befb4061221 100644 --- a/oneflow/core/framework/op_expr_helper.cpp +++ b/oneflow/core/framework/op_expr_helper.cpp @@ -647,8 +647,8 @@ Maybe DimScatterAddLikeOp(const int32_t dim) { Maybe DimScatterAddLikeOp(const int32_t dim, const std::string& name) { return one::OpBuilder("dim_scatter_add_like", name) .Input("like") - .Input("input") .Input("index") + .Input("src") .Output("output") .Attr("dim", dim) .Build();
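
Usage sketch (illustrative only, not part of the patch series; it assumes the Python API in its final state after PATCH 82, i.e. flow.scatter accepting a Tensor or float src and flow.scatter_add accepting a Tensor src; the input, index, and src values mirror the docstring examples in the scatter.py hunks above):

    import numpy as np
    import oneflow as flow

    # 3x5 input filled with 2.0, as in the docstring examples
    input = flow.ones((3, 5)) * 2
    index = flow.tensor(np.array([[0, 1, 2], [0, 1, 4]]), dtype=flow.int32)
    src = flow.Tensor(np.array([[0, 10, 20, 30, 40], [50, 60, 70, 80, 90]]))

    # tensor src: out[i][index[i][j]] = src[i][j] along dim=1,
    # positions not referenced by index keep the original value 2.0
    out = flow.scatter(input, 1, index, src)

    # float src: every indexed position is overwritten with the scalar
    out_scalar = flow.scatter(input, 1, index, 3.14)

    # scatter_add accumulates instead of overwriting:
    # out[i][index[i][j]] += src[i][j] along dim=1
    out_add = flow.scatter_add(input, 1, index, src)
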