Cumprod #7278

Merged · 24 commits · Jan 24, 2022

Changes from 15 commits
1 change: 1 addition & 0 deletions docs/source/oneflow.rst
@@ -157,5 +157,6 @@ oneflow
read_onerec,
from_numpy,
cumsum,
cumprod,

.. autofunction:: oneflow.relu
@@ -60,5 +60,48 @@ class CumsumGrad : public OpExprGradFunction<CumsumCaptureState> {

REGISTER_OP_EXPR_GRAD_FUNCTION("cumsum", CumsumGrad);

struct CumProdCaptureState : public AutoGradCaptureState {
bool requires_grad = false;
int64_t dim = 0;
};

class CumProdGrad : public OpExprGradFunction<CumProdCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr);
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}

Maybe<void> Capture(CumProdCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1);
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }

ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->dim = JUST(composed_attrs.GetAttr<int64_t>("dim"));
ctx->SaveTensorForBackward(outputs.at(0));
ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}

Maybe<void> Apply(const CumProdCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
CHECK_EQ_OR_RETURN(out_grads.size(), 1);
in_grads->resize(1);
if (ctx->requires_grad) {
in_grads->at(0) = JUST(functional::CumProdGrad(out_grads.at(0), ctx->SavedTensors().at(0),
ctx->SavedTensors().at(1), ctx->dim));
}
return Maybe<void>::Ok();
}

private:
AttrMap base_attrs_;
};

REGISTER_OP_EXPR_GRAD_FUNCTION("cumprod", CumProdGrad);
} // namespace one
} // namespace oneflow
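
For reference, the gradient that functional::CumProdGrad must produce follows from differentiating the cumulative product; a sketch of the zero-free case (zeros are handled specially in the CPU kernel further down):

    y_k = \prod_{i \le k} x_i
    \qquad\Rightarrow\qquad
    \frac{\partial L}{\partial x_k}
      = \sum_{j \ge k} dy_j \, \frac{\partial y_j}{\partial x_k}
      = \frac{1}{x_k} \sum_{j \ge k} dy_j \, y_j \qquad (x_k \neq 0),

i.e. the flip(cumsum(flip(dy * y))) / x form implemented by the kernel.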
8 changes: 8 additions & 0 deletions oneflow/core/functional/functional_api.yaml
@@ -1914,3 +1914,11 @@
- name: "cumsum_grad"
signature: "Tensor (Tensor input, Int64 dim) => CumsumGrad"
bind_python: False

- name: "cumprod"
signature: "Tensor (Tensor input, Int64 dim) => CumProd"
bind_python: True

- name: "cumprod_grad"
signature: "Tensor (Tensor input, Tensor y, Tensor x, Int64 dim) => CumProdGrad"
bind_python: False
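
As a quick sanity check of the new binding, a minimal Python sketch (assuming the op is exposed as oneflow.cumprod, matching the docs entry above; tensor-construction details may differ):

import oneflow as flow

x = flow.tensor([1.0, 2.0, 3.0, 4.0], requires_grad=True)
y = flow.cumprod(x, dim=0)  # [1., 2., 6., 24.]
y.sum().backward()
# With dy = 1 everywhere, dx_k = sum_{j >= k} y_j / x_k:
print(x.grad)  # expected [33., 16., 10., 6.]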
48 changes: 48 additions & 0 deletions oneflow/core/functional/impl/math_functor.cpp
@@ -1819,6 +1819,52 @@ class CumsumGradFunctor {
private:
std::shared_ptr<OpExpr> op_;
};

class CumProdFunctor {
public:
CumProdFunctor() { op_ = CHECK_JUST(one::OpBuilder("cumprod").Input("x").Output("y").Build()); }
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, int64_t dim) const {
auto ndim = input->ndim();
CHECK_OR_RETURN(dim >= -ndim && dim < ndim)
<< "IndexError: Dimension out of range (expected to be in range of [" << -ndim << ", "
<< ndim << ") but got " << dim;
if (dim < 0) { dim += ndim; }

MutableAttrMap attrs;
JUST(attrs.SetAttr<int64_t>("dim", dim));
TensorProcessor tensor_processor;
JUST(tensor_processor.AddInputs({input}, DType::Int64()).Apply());
TensorTuple input_tuple = JUST(tensor_processor.GetInputs());
return OpInterpUtil::Dispatch<Tensor>(*op_, input_tuple, attrs);
}

private:
std::shared_ptr<OpExpr> op_;
};

class CumProdGradFunctor {
public:
CumProdGradFunctor() {
op_ = CHECK_JUST(one::OpBuilder("cumprod_grad")
.Input("dy")
.Input("output")
.Input("input")
.Output("dx")
.Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& dy,
const std::shared_ptr<one::Tensor>& y,
const std::shared_ptr<one::Tensor>& x, int64_t dim) const {
// No need to validate dim here; CumProdFunctor has already checked it.
MutableAttrMap attrs;
JUST(attrs.SetAttr<int64_t>("dim", dim));
return OpInterpUtil::Dispatch<Tensor>(*op_, {dy, y, x}, attrs);
}

private:
std::shared_ptr<OpExpr> op_;
};

} // namespace impl

using namespace impl;
@@ -1890,6 +1936,8 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
m.add_functor<ErfinvInplaceFunctor>("ErfinvInplace");
m.add_functor<CumsumFunctor>("Cumsum");
m.add_functor<CumsumGradFunctor>("CumsumGrad");
m.add_functor<CumProdFunctor>("CumProd");
m.add_functor<CumProdGradFunctor>("CumProdGrad");
};

} // namespace functional
34 changes: 34 additions & 0 deletions oneflow/ir/include/OneFlow/OneFlowUserOps.td
@@ -4007,6 +4007,40 @@ def OneFlow_CumsumGradOp : OneFlow_BaseOp<"cumsum_grad", [NoSideEffect, DeclareO
let has_data_type_infer_fn = 1;
}

def OneFlow_CumProdOp : OneFlow_BaseOp<"cumprod", [NoSideEffect, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
let input = (ins
OneFlow_Tensor:$x
);
let output = (outs
OneFlow_Tensor:$y
);
let attrs = (ins
SI64Attr:$dim
);
let has_logical_tensor_desc_infer_fn = 1;
let has_physical_tensor_desc_infer_fn = 1;
let has_get_sbp_fn = 1;
let has_data_type_infer_fn = 1;
}

def OneFlow_CumProdGradOp : OneFlow_BaseOp<"cumprod_grad", [NoSideEffect, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
let input = (ins
OneFlow_Tensor:$dy,
OneFlow_Tensor:$output,
OneFlow_Tensor:$input
);
let output = (outs
OneFlow_Tensor:$dx
);
let attrs = (ins
SI64Attr:$dim
);
let has_logical_tensor_desc_infer_fn = 1;
let has_physical_tensor_desc_infer_fn = 1;
let has_get_sbp_fn = 1;
let has_data_type_infer_fn = 1;
}

def OneFlow_ErfInvOp : OneFlow_BaseOp<"erfinv", [NoSideEffect, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
let input = (ins
OneFlow_Tensor:$x
213 changes: 213 additions & 0 deletions oneflow/user/kernels/cum_backward_kernel.cpp
@@ -0,0 +1,213 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/user/kernels/cum_kernel.h"

namespace oneflow {
namespace {
template<typename T>
void CumSumBackward(const T* in_ptr, T* out_ptr, int64_t cs_up_space, int64_t cs_space,
int64_t cs_down_space, int64_t elem_cnt) {
auto* tmp_in_ptr_base = in_ptr;
auto* tmp_out_ptr_base = out_ptr;
auto step = cs_space * cs_down_space;
for (auto i = 0; i < cs_up_space; i++) {
// dx_j = sum_{k >= j} dy_k: a reverse cumulative sum along the cum dimension.
for (auto k = 0; k < cs_down_space; k++) {
T running_sum = 0;
for (auto j = cs_space - 1; j >= 0; j--) {
running_sum += tmp_in_ptr_base[j * cs_down_space + k];
tmp_out_ptr_base[j * cs_down_space + k] = running_sum;
}
}
tmp_in_ptr_base += step;
tmp_out_ptr_base += step;
}
}

// O(n) cumprod backward. With no zeros along the cum dimension the formula is
// dx = flip(cumsum(flip(dy * y))) / x. Zeros need special handling: y vanishes
// from the first zero onward, and the division by x is undefined there.
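// Worked example (one lane, cum dimension of length 4): x = [2, 0, 3, 4] gives
// y = [2, 0, 0, 0] and first-zero index z = 1.
//   k < z:  dx_0 = dy_0 * y_0 / x_0 (later terms vanish because y_j = 0).
//   k == z: dx_1 = y_0 * (dy_1 + dy_2 * x_2 + dy_3 * x_2 * x_3).
//   k > z:  dx_k = 0, since every y_j with j >= k still contains the factor x_z = 0.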
template<typename T>
void CumProdBackward(const T* dy_ptr, T* dx_ptr, const T* output_ptr, const T* input_ptr,
const int64_t up_space, const int64_t space, const int64_t down_space,
const int64_t elem_cnt) {
const auto step = space * down_space;
for (size_t i = 0; i < up_space; i++) {
// 2-D buffer holding, for each position, the running count of zeros seen so far
// along the cum dimension.
std::vector<size_t> cumsum_zeros_number(space * down_space, 0);
auto* cumsum_zeros_number_ptr = cumsum_zeros_number.data();
for (size_t j = 0; j < space; j++) {
const size_t ptr_offset = j * down_space;
auto* tmp_input_ptr = input_ptr + ptr_offset;
auto* tmp_cumsum_zeros_number_ptr = cumsum_zeros_number_ptr + ptr_offset;
auto* last_tmp_cumsum_zeros_number_ptr = tmp_cumsum_zeros_number_ptr - down_space;
for (auto k = 0; k < down_space; k++) {
int is_zero = tmp_input_ptr[k] == 0 ? 1 : 0;
tmp_cumsum_zeros_number_ptr[k] =
is_zero + (j == 0 ? 0 : last_tmp_cumsum_zeros_number_ptr[k]);
}
}
{
// Case 1: positions before the first zero (k < z, where z is the index of the first zero).
std::vector<T> reverse_cumsum(down_space, 0);
for (size_t j = 0; j < space; j++) {
const size_t ptr_offset = (space - j - 1) * down_space;
auto* tmp_cumsum_zeros_number_ptr = cumsum_zeros_number_ptr + ptr_offset;
auto* tmp_dy_ptr = dy_ptr + ptr_offset;
auto* tmp_dx_ptr = dx_ptr + ptr_offset;
auto* tmp_output_ptr = output_ptr + ptr_offset;
auto* tmp_input_ptr = input_ptr + ptr_offset;
for (auto k = 0; k < down_space; k++) {
if (tmp_cumsum_zeros_number_ptr[k] > 0) {
// At or after the first zero the gradient is either written below (k == z) or zero (k > z).
tmp_dx_ptr[k] = 0;
continue;
}
reverse_cumsum[k] += tmp_output_ptr[k] * tmp_dy_ptr[k];
tmp_dx_ptr[k] = reverse_cumsum[k] / tmp_input_ptr[k];
}
}
}
{
// Case 2: the position of the first zero itself (k == z).
std::vector<size_t> first_zero(down_space, space);
for (size_t j = 0; j < space; j++) {
auto* tmp_cumsum_zeros_number_ptr = cumsum_zeros_number_ptr + j * down_space;
for (size_t k = 0; k < down_space; k++) {
if (tmp_cumsum_zeros_number_ptr[k] == 1 && first_zero[k] == space) { first_zero[k] = j; }
}
}
// Accumulate, lane by lane, the gradient contribution at the first-zero position.
std::vector<T> cumsum_buffer(down_space, 0);
for (size_t k = 0; k < down_space; k++) {
auto* tmp_input_down_offset_ptr = input_ptr + k;
auto* tmp_output_down_offset_ptr = output_ptr + k;
auto* tmp_dy_down_offset_ptr = dy_ptr + k;
auto* tmp_cumsum_zero_number_down_offset_ptr = cumsum_zeros_number_ptr + k;

size_t first_zero_index = first_zero[k];
if (first_zero_index == space) { continue; }
// Cumulative product of the inputs before the first zero (1 when the zero comes first).
T cumprod_before_first_zero =
first_zero_index == 0
? static_cast<T>(1)
: *(tmp_output_down_offset_ptr + (first_zero_index - 1) * down_space);
T cumprod = 1;
for (size_t j = first_zero_index; j < space; j++) {
const size_t ptr_offset = j * down_space;
auto tmp_dy = *(tmp_dy_down_offset_ptr + ptr_offset);
auto tmp_input = *(tmp_input_down_offset_ptr + ptr_offset);
auto tmp_cumsum_zero_number = *(tmp_cumsum_zero_number_down_offset_ptr + ptr_offset);
if (tmp_cumsum_zero_number != 1) { continue; }
if (j != first_zero_index) { cumprod *= tmp_input; }
cumsum_buffer[k] += cumprod_before_first_zero * tmp_dy * cumprod;
}
}
for (size_t j = 0; j < down_space; j++) {
if (first_zero[j] == space) { continue; }
// Scatter the accumulated gradient to the first-zero position of each lane.
*(dx_ptr + first_zero[j] * down_space + j) = cumsum_buffer[j];
}
}

input_ptr += step;
output_ptr += step;
dy_ptr += step;
dx_ptr += step;
}
}
} // namespace

class CpuCumGradKernel : public user_op::OpKernel {
public:
CpuCumGradKernel() = default;
~CpuCumGradKernel() = default;

private:
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<typename T>
class CpuCumsumGradKernel final : public CpuCumGradKernel {
public:
CpuCumsumGradKernel() = default;
~CpuCumsumGradKernel() = default;

private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
auto elem_cnt = dy->shape().elem_cnt();
auto dim = ctx->Attr<int64_t>("dim");
const auto* dy_ptr = dy->dptr<T>();
auto* dx_ptr = dx->mut_dptr<T>();

// take cumsum's abbreviation as `cs`
// data partition: cs_up_space|cs_space|cs_down_space
auto cs_up_space = elem_cnt / dx->shape().Count(dim);
auto cs_space = dx->shape().At(dim);
auto cs_down_space = dx->shape().Count(dim + 1);
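// e.g. for shape (2, 3, 4) and dim = 1: cs_up_space = 2, cs_space = 3, cs_down_space = 4.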

CumSumBackward(dy_ptr, dx_ptr, cs_up_space, cs_space, cs_down_space, elem_cnt);
}
};

#define REGISTER_CPU_CUMSUM_GRAD_KERNEL(dtype) \
REGISTER_USER_KERNEL("cumsum_grad") \
.SetCreateFn<CpuCumsumGradKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
&& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));

REGISTER_CPU_CUMSUM_GRAD_KERNEL(float)
REGISTER_CPU_CUMSUM_GRAD_KERNEL(double)
#undef REGISTER_CPU_CUMSUM_GRAD_KERNEL

template<typename T>
class CpuCumProdGradKernel final : public CpuCumGradKernel {
public:
CpuCumProdGradKernel() = default;
~CpuCumProdGradKernel() = default;

private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const auto* output = ctx->Tensor4ArgNameAndIndex("output", 0);
const auto* input = ctx->Tensor4ArgNameAndIndex("input", 0);
const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
const int64_t elem_cnt = dy->shape().elem_cnt();
if (elem_cnt == 0) { return; }

const auto* output_ptr = output->dptr<T>();
const auto* input_ptr = input->dptr<T>();
const auto* dy_ptr = dy->dptr<T>();
auto* dx_ptr = dx->mut_dptr<T>();

// data partition: up_space|space|down_space
auto dim = ctx->Attr<int64_t>("dim");
auto up_space = elem_cnt / dx->shape().Count(dim);
auto space = dx->shape().At(dim);
auto down_space = dx->shape().Count(dim + 1);
if (space == 1) {
Memcpy<DeviceType::kCPU>(ctx->stream(), dx_ptr, dy_ptr, elem_cnt * sizeof(T));
return;
}
CumProdBackward(dy_ptr, dx_ptr, output_ptr, input_ptr, up_space, space, down_space, elem_cnt);
}
};

#define REGISTER_CPU_CUMPROD_GRAD_KERNEL(dtype) \
REGISTER_USER_KERNEL("cumprod_grad") \
.SetCreateFn<CpuCumProdGradKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \
&& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));

REGISTER_CPU_CUMPROD_GRAD_KERNEL(float)
REGISTER_CPU_CUMPROD_GRAD_KERNEL(double)
#undef REGISTER_CPU_CUMPROD_GRAD_KERNEL

} // namespace oneflow
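
The zero-handling path in CumProdBackward can be exercised from Python with an input that contains a zero (a sketch under the same assumptions as above):

import oneflow as flow

x = flow.tensor([2.0, 0.0, 3.0, 4.0], requires_grad=True)
y = flow.cumprod(x, dim=0)  # [2., 0., 0., 0.]
y.sum().backward()
# dx_0 = y_0 / x_0 = 1
# dx_1 = x_0 * (1 + x_2 + x_2 * x_3) = 2 * (1 + 3 + 12) = 32
# dx_2 = dx_3 = 0 (every downstream product keeps the zero factor)
print(x.grad)  # expected [1., 32., 0., 0.]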