Lcy logsoftmax (#5746)
* first commit

* for head

* cuda

* complete

* rename

* revise

* space line

* solve segment error

* review1

* repeat

* auto format by CI

* rm si file

* rm files

* rm space

* rm space1

* rm space1

* auto format by CI

Co-authored-by: mu <702572275@qq.com>
Co-authored-by: oneflow-ci-bot <ci-bot@oneflow.org>
3 people committed Aug 13, 2021
1 parent c071635 commit d3ca591
Showing 10 changed files with 701 additions and 6 deletions.
82 changes: 82 additions & 0 deletions oneflow/core/autograd/gradient_funcs/logsoftmax.cpp
@@ -0,0 +1,82 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/framework/op_expr.h"
#include "oneflow/core/framework/op_expr_helper.h"
#include "oneflow/core/framework/user_op_conf_trait.h"

namespace oneflow {
namespace one {

struct LogSoftmaxInterpState : public OpExprInterpState {
  bool requires_grad;
};

class LogSoftmax : public OpExprGradFunction<LogSoftmaxInterpState> {
 public:
  Maybe<void> Init(const OpExpr& op) override;
  Maybe<void> Capture(LogSoftmaxInterpState* ctx, const TensorTuple& inputs,
                      const TensorTuple& outputs, const AttrMap& attrs) const override;
  Maybe<void> Apply(const LogSoftmaxInterpState* ctx, const TensorTuple& out_grads,
                    TensorTuple* in_grads) const override;

 private:
  AttrMap base_attrs_;
  std::shared_ptr<OpExpr> grad_op_;
};

Maybe<void> LogSoftmax::Init(const OpExpr& op) {
  const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
  CHECK_NOTNULL_OR_RETURN(fw_op_expr);
  const std::string& op_name = fw_op_expr->op_name();
  base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
  grad_op_ = JUST(one::OpBuilder("logsoftmax_grad", GradientOpName(op_name))
                      .Input("prob")
                      .Input("dy")
                      .Output("dx")
                      .Build());
  return Maybe<void>::Ok();
}

Maybe<void> LogSoftmax::Capture(LogSoftmaxInterpState* ctx, const TensorTuple& inputs,
                                const TensorTuple& outputs, const AttrMap& attrs) const {
  ComposedAttrMap composed_attrs(attrs, base_attrs_);
  CHECK_EQ_OR_RETURN(inputs.size(), 1);
  ctx->requires_grad = inputs.at(0)->requires_grad();

  if (!ctx->requires_grad) return Maybe<void>::Ok();

  ctx->SaveTensorForBackward(outputs.at(1));
  return Maybe<void>::Ok();
}

Maybe<void> LogSoftmax::Apply(const LogSoftmaxInterpState* ctx, const TensorTuple& out_grads,
                              TensorTuple* in_grads) const {
  if (!ctx->requires_grad) return Maybe<void>::Ok();
  CHECK_EQ_OR_RETURN(out_grads.size(), 2);
  const auto& dy = out_grads.at(0);
  const auto& prob = ctx->SavedTensors().at(0);
  in_grads->resize(1);
  in_grads->at(0) = JUST(OpInterpUtil::Dispatch<Tensor>(*grad_op_, {prob, dy}));
  return Maybe<void>::Ok();
}

REGISTER_OP_EXPR_GRAD_FUNCTION("logsoftmax", LogSoftmax);

} // namespace one
} // namespace oneflow
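
For reference, the forward op produces two outputs, out = log_softmax(in) and prob = softmax(in); Capture saves prob (outputs.at(1)) and Apply feeds {prob, dy} to logsoftmax_grad. A sketch of the math that grad op is therefore expected to implement, assuming prob indeed holds the softmax probabilities:

\frac{\partial L}{\partial x_i} \;=\; \frac{\partial L}{\partial y_i} \;-\; p_i \sum_j \frac{\partial L}{\partial y_j},
\qquad y = \log \mathrm{softmax}(x),\quad p = \mathrm{softmax}(x)
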
4 changes: 4 additions & 0 deletions oneflow/core/functional/functional_api.yaml
@@ -232,6 +232,10 @@
  signature: "Tensor Softmax(Tensor x)"
  bind_python: True

- name: "logsoftmax"
  signature: "Tensor LogSoftmax(Tensor x)"
  bind_python: True

- name: "hardswish"
  signature: "Tensor HardSwish(Tensor x)"
  bind_python: True
14 changes: 14 additions & 0 deletions oneflow/core/functional/impl/activation_functor.cpp
@@ -186,6 +186,19 @@ class SoftmaxFunctor : public UnaryFunctor {
  }
};

class LogSoftmaxFunctor : public UnaryFunctor {
 public:
  LogSoftmaxFunctor() {
    op_ = CHECK_JUST(one::OpBuilder("logsoftmax").Input("in").Output("out").Output("prob").Build());
  }
  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& logits) const {
    return OpInterpUtil::Dispatch<Tensor>(*op_, {logits});
  }

 private:
  std::shared_ptr<OpExpr> op_;
};

class HardSwishFunctor : public UnaryFunctor {
 public:
  HardSwishFunctor() {
@@ -297,6 +310,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
  m.add_functor<impl::HardSigmoidFunctor>("HardSigmoid");
  m.add_functor<impl::HardSigmoidGradFunctor>("HardSigmoidGrad");
  m.add_functor<impl::SoftmaxFunctor>("Softmax");
  m.add_functor<impl::LogSoftmaxFunctor>("LogSoftmax");
  m.add_functor<impl::HardSwishFunctor>("HardSwish");
  m.add_functor<impl::HardSwishGradFunctor>("HardSwishGrad");
  m.add_functor<impl::LeakyReluFunctor>("LeakyRelu");
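
The functor is registered under the name "LogSoftmax", which pairs with the "logsoftmax" entry added to functional_api.yaml above. A minimal, hypothetical C++ call site; the header path and the generated functional::LogSoftmax signature are assumptions inferred from the yaml signature, not shown in this diff:

#include "oneflow/core/functional/functional.h"  // assumed location of the generated API

// Hypothetical helper: applies the new op to a floating-point tensor `x`.
Maybe<one::Tensor> LogSoftmaxExample(const std::shared_ptr<one::Tensor>& x) {
  // Resolves to LogSoftmaxFunctor above, which dispatches the "logsoftmax" user op;
  // the first output ("out") is returned, while the second ("prob") is what the
  // autograd function captures for the backward pass.
  return one::functional::LogSoftmax(x);
}
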
118 changes: 118 additions & 0 deletions oneflow/user/kernels/logsoftmax_kernel.cpp
@@ -0,0 +1,118 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/user/kernels/softmax_kernel_util.h"
#include "oneflow/user/kernels/logsoftmax_kernel_util.h"

namespace oneflow {

namespace {

template<DeviceType device_type, typename T>
class LogSoftmaxKernel final : public user_op::OpKernel {
 public:
  LogSoftmaxKernel() = default;
  ~LogSoftmaxKernel() override = default;

 private:
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    const int64_t num_classes = in->shape().At(in->shape().NumAxes() - 1);
    const int64_t num_instances = in->shape().Count(0, in->shape().NumAxes() - 1);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const size_t temp_storage_bytes = tmp_buffer->shape().elem_cnt();

    LogSoftmaxKernelUtil<device_type, T>::ComputeOut(
        ctx->device_ctx(), num_instances, num_classes, in->dptr<T>(), prob->mut_dptr<T>(),
        out->mut_dptr<T>(), tmp_buffer->mut_dptr(), temp_storage_bytes);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
user_op::InferTmpSizeFn GenFwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const Shape& in_shape = ctx->InputShape("in", 0);
    const int64_t num_classes = in_shape.At(in_shape.NumAxes() - 1);
    const int64_t num_instances = in_shape.Count(0, in_shape.NumAxes() - 1);
    return LogSoftmaxKernelUtil<device_type, T>::GetComputeProbTempStorageSizeInBytes(num_instances,
                                                                                      num_classes);
  };
}

#define REGISTER_LOGSOFTMAX_KERNEL(device, dtype)                                        \
  REGISTER_USER_KERNEL("logsoftmax")                                                     \
      .SetCreateFn<LogSoftmaxKernel<device, dtype>>()                                    \
      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                               \
                       & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value)   \
                       & (user_op::HobDataType("prob", 0) == GetDataType<dtype>::value)) \
      .SetInferTmpSizeFn(GenFwInferTmpSizeFn<device, dtype>());

REGISTER_LOGSOFTMAX_KERNEL(DeviceType::kCPU, float)
REGISTER_LOGSOFTMAX_KERNEL(DeviceType::kCPU, double)

template<DeviceType device_type, typename T>
class LogSoftmaxGradKernel final : public user_op::OpKernel {
 public:
  LogSoftmaxGradKernel() = default;
  ~LogSoftmaxGradKernel() override = default;

 private:
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0);
    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);

    const int64_t num_classes = prob->shape().At(prob->shape().NumAxes() - 1);
    const int64_t num_instances = prob->shape().elem_cnt() / num_classes;

    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const size_t temp_storage_bytes = tmp_buffer->shape().elem_cnt();

    LogSoftmaxKernelUtil<device_type, T>::ComputeDiff(
        ctx->device_ctx(), num_instances, num_classes, dy->dptr<T>(), prob->dptr<T>(),
        dx->mut_dptr<T>(), tmp_buffer->mut_dptr(), temp_storage_bytes);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
user_op::InferTmpSizeFn GenBwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const Shape& dy_shape = ctx->InputShape("dy", 0);
    const int64_t num_classes = dy_shape.At(dy_shape.NumAxes() - 1);
    const int64_t num_instances = dy_shape.Count(0, dy_shape.NumAxes() - 1);
    return LogSoftmaxKernelUtil<device_type, T>::GetComputeDiffTempStorageSizeInBytes(num_instances,
                                                                                      num_classes);
  };
}

#define REGISTER_LOGSOFTMAX_GRAD_KERNEL(device, dtype)                                  \
  REGISTER_USER_KERNEL("logsoftmax_grad")                                               \
      .SetCreateFn<LogSoftmaxGradKernel<device, dtype>>()                               \
      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                              \
                       & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))  \
      .SetInferTmpSizeFn(GenBwInferTmpSizeFn<device, dtype>());

REGISTER_LOGSOFTMAX_GRAD_KERNEL(DeviceType::kCPU, float)
REGISTER_LOGSOFTMAX_GRAD_KERNEL(DeviceType::kCPU, double)

} // namespace

} // namespace oneflow
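
Both kernels view the input as a (num_instances, num_classes) matrix and reduce over the trailing class axis, with tmp_buffer sizing delegated to LogSoftmaxKernelUtil. For reference, the forward quantities being produced, written in the usual numerically stable form (the max-subtraction is an assumption about the util's internals, which this diff does not show):

m = \max_j x_j, \qquad
\mathrm{out}_i = (x_i - m) - \log \sum_j e^{x_j - m}, \qquad
\mathrm{prob}_i = e^{\mathrm{out}_i}
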
120 changes: 120 additions & 0 deletions oneflow/user/kernels/logsoftmax_kernel.cu
@@ -0,0 +1,120 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/cuda/softmax.cuh"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/user/kernels/logsoftmax_kernel_util.h"

namespace oneflow {

namespace {

template<DeviceType device_type, typename T>
class LogSoftmaxKernel final : public user_op::OpKernel {
 public:
  LogSoftmaxKernel() = default;
  ~LogSoftmaxKernel() override = default;

 private:
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    const int64_t num_classes = in->shape().At(in->shape().NumAxes() - 1);
    const int64_t num_instances = in->shape().Count(0, in->shape().NumAxes() - 1);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const size_t temp_storage_bytes = tmp_buffer->shape().elem_cnt();

    LogSoftmaxKernelUtil<device_type, T>::ComputeOut(
        ctx->device_ctx(), num_instances, num_classes, in->dptr<T>(), prob->mut_dptr<T>(),
        out->mut_dptr<T>(), tmp_buffer->mut_dptr(), temp_storage_bytes);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
user_op::InferTmpSizeFn GenFwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const Shape& in_shape = ctx->InputShape("in", 0);
    const int64_t num_classes = in_shape.At(in_shape.NumAxes() - 1);
    const int64_t num_instances = in_shape.Count(0, in_shape.NumAxes() - 1);
    return LogSoftmaxKernelUtil<device_type, T>::GetComputeProbTempStorageSizeInBytes(num_instances,
                                                                                      num_classes);
  };
}

#define REGISTER_LOGSOFTMAX_KERNEL(device, dtype)                                        \
  REGISTER_USER_KERNEL("logsoftmax")                                                     \
      .SetCreateFn<LogSoftmaxKernel<device, dtype>>()                                    \
      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                               \
                       & (user_op::HobDataType("out", 0) == GetDataType<dtype>::value)   \
                       & (user_op::HobDataType("prob", 0) == GetDataType<dtype>::value)) \
      .SetInferTmpSizeFn(GenFwInferTmpSizeFn<device, dtype>());

REGISTER_LOGSOFTMAX_KERNEL(DeviceType::kGPU, half)
REGISTER_LOGSOFTMAX_KERNEL(DeviceType::kGPU, float)
REGISTER_LOGSOFTMAX_KERNEL(DeviceType::kGPU, double)

template<DeviceType device_type, typename T>
class LogSoftmaxGradKernel final : public user_op::OpKernel {
 public:
  LogSoftmaxGradKernel() = default;
  ~LogSoftmaxGradKernel() override = default;

 private:
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0);
    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);

    const int64_t num_classes = prob->shape().At(prob->shape().NumAxes() - 1);
    const int64_t num_instances = prob->shape().elem_cnt() / num_classes;

    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const size_t temp_storage_bytes = tmp_buffer->shape().elem_cnt();

    LogSoftmaxKernelUtil<device_type, T>::ComputeDiff(
        ctx->device_ctx(), num_instances, num_classes, dy->dptr<T>(), prob->dptr<T>(),
        dx->mut_dptr<T>(), tmp_buffer->mut_dptr(), temp_storage_bytes);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
user_op::InferTmpSizeFn GenBwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const Shape& dy_shape = ctx->InputShape("dy", 0);
    const int64_t num_classes = dy_shape.At(dy_shape.NumAxes() - 1);
    const int64_t num_instances = dy_shape.Count(0, dy_shape.NumAxes() - 1);
    return LogSoftmaxKernelUtil<device_type, T>::GetComputeDiffTempStorageSizeInBytes(num_instances,
                                                                                      num_classes);
  };
}

#define REGISTER_LOGSOFTMAX_GRAD_KERNEL(device, dtype)                                  \
  REGISTER_USER_KERNEL("logsoftmax_grad")                                               \
      .SetCreateFn<LogSoftmaxGradKernel<device, dtype>>()                               \
      .SetIsMatchedHob((user_op::HobDeviceTag() == device)                              \
                       & (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))  \
      .SetInferTmpSizeFn(GenBwInferTmpSizeFn<device, dtype>());

REGISTER_LOGSOFTMAX_GRAD_KERNEL(DeviceType::kGPU, half)
REGISTER_LOGSOFTMAX_GRAD_KERNEL(DeviceType::kGPU, float)
REGISTER_LOGSOFTMAX_GRAD_KERNEL(DeviceType::kGPU, double)

} // namespace

} // namespace oneflow