PaddlePaddle · sweetsky0901 · Nov 20, 2017 · Nov 11, 2017 · Nov 11, 2017 · Nov 11, 2017
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
@@ -162,6 +162,7 @@ set(DEPS_OPS
     softmax_with_cross_entropy_op
     sum_op
     pool_op
+    maxout_op
     pool_with_index_op
     conv_op
     lstm_op
@@ -182,6 +183,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(conv_op DEPS vol2col)
 op_library(sum_op DEPS net_op selected_rows_functor)
 op_library(pool_op DEPS pooling)
+op_library(maxout_op DEPS maxouting)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
 op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)

diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
@@ -14,6 +14,7 @@ if(WITH_GPU)
     nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
     nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
+    nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
@@ -26,6 +27,7 @@ else()
     cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
     cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
     cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
+    cc_library(maxouting SRCS maxouting.cc DEPS device_context)
 endif()
 
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)

diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/maxouting.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * CPU maxout forward pass. All tensors are in NCHW format and groups must
+ * be > 1. For output channel c the indexing reads input channels
+ * c * groups .. c * groups + groups - 1 and reduces them via maxout_process.
+ */
+template <typename MaxOutProcess, typename T>
+class MaxOutFunctor<platform::CPUPlace, MaxOutProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups, MaxOutProcess maxout_process) {
+    const int num_samples = input.dims()[0];
+    const int height = input.dims()[2];
+    const int width = input.dims()[3];
+    const int out_channels = output->dims()[1];
+
+    // Elements in one H x W plane, and output elements in one sample.
+    const int plane = height * width;
+    const int sample_len = plane * out_channels;
+
+    const T* src = input.data<T>();
+    T* dst = output->mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < num_samples; ++n) {
+      for (int c = 0; c < out_channels; ++c) {
+        // Offset of output plane (n, c); its input group starts at
+        // out_base * groups.
+        const int out_base = sample_len * n + plane * c;
+        for (int pos = 0; pos < plane; ++pos) {
+          T acc = maxout_process.initial();
+          for (int g = 0; g < groups; ++g) {
+            maxout_process.compute(acc,
+                                   src[out_base * groups + g * plane + pos]);
+          }
+          dst[out_base + pos] = acc;
+        }
+      }
+    }
+  }
+};
+
+
+
+// CPU maxout backward pass (NCHW): per output element, the first input of its
+// group equal to the forward output gets the gradient; the rest get zero.
+template <class T>
+class MaxOutGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, int groups) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int fea_size = input_height * input_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; ++i) {
+      const int blen = fea_size * output_channels * i;
+      for (int c = 0; c < output_channels; ++c) {
+        const int clen = fea_size * c;
+        for (int f = 0; f < fea_size; ++f) {
+          const int output_idx = blen + clen + f;
+          bool matched = false;
+          // Write every member of the group (not only those up to the match)
+          // so the result does not rely on input_grad being pre-zeroed.
+          for (int g = 0; g < groups; ++g) {
+            const int input_idx = (blen + clen) * groups + fea_size * g + f;
+            const bool is_max =
+                !matched && input_data[input_idx] == output_data[output_idx];
+            input_grad_data[input_idx] =
+                is_max ? output_grad_data[output_idx] : static_cast<T>(0);
+            matched = matched || is_max;
+          }
+        }
+      }
+    }
+  }
+};
+
+// Explicit instantiations of the CPU specialisations defined above, for the
+// float and double element types.
+template class MaxOutFunctor<platform::CPUPlace, math::MaxOut<float>, float>;
+template class MaxOutFunctor<platform::CPUPlace, math::MaxOut<double>, double>;
+template class MaxOutGradFunctor<platform::CPUPlace, float>;
+template class MaxOutGradFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/maxouting.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Maxout forward kernel: reduces the `groups` inputs of each output element.
+template <typename MaxOutProcess, typename T>
+__global__ void KernelMaxOut(const int nthreads, const T* input_data,
+                             const int channels, const int input_height,
+                             const int input_width, int groups, T* output_data,
+                             MaxOutProcess maxout_process) {
+  const int plane = input_height * input_width;
+  const int per_sample = input_height * input_width * channels / groups;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  while (index < nthreads) {
+    // Split the flat output index into (sample, channel, position).
+    const int n = index / per_sample;
+    const int c = (index % per_sample) / plane;
+    const int pos = (index % per_sample) % plane;
+    const int first = (n * per_sample + c * plane) * groups + pos;
+    T acc = maxout_process.initial();
+    for (int g = 0; g < groups; ++g) {
+      maxout_process.compute(acc, input_data[first + g * plane]);
+    }
+    output_data[index] = acc;
+    index += blockDim.x * gridDim.x;
+  }
+}
+// Maxout backward kernel: adds output_grad[index] onto the first input of the
+// group that equals output_data[index]. NOTE(review): it only accumulates, so
+// input_grad is presumably zero-filled by the caller -- confirm.
+template <typename T>
+__global__ void KernelMaxoutGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, int groups) {
+  const int plane = input_height * input_width;
+  const int per_sample = input_height * input_width * channels / groups;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  while (index < nthreads) {
+    const int n = index / per_sample;
+    const int c = (index % per_sample) / plane;
+    const int pos = (index % per_sample) % plane;
+    const int first = (n * per_sample + c * plane) * groups + pos;
+    // Find the first group member equal to the forward output (if any).
+    int g = 0;
+    while (g < groups &&
+           !(input_data[first + g * plane] == output_data[index])) {
+      ++g;
+    }
+    if (g < groups) {
+      const int max_index = first + g * plane;
+      platform::CudaAtomicAdd(input_grad + max_index, output_grad[index]);
+    }
+    index += blockDim.x * gridDim.x;
+  }
+}
+/*
+ * GPU maxout forward pass. All tensors are in NCHW format.
+ * Launches KernelMaxOut on the stream of `context`, with enough blocks of
+ * kThreadsPerBlock threads to cover every output element.
+ */
+template <typename MaxOutProcess, typename T>
+class MaxOutFunctor<platform::GPUPlace, MaxOutProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups, MaxOutProcess maxout_process) {
+    const int kThreadsPerBlock = 1024;
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    const int nthreads = output->numel();
+    // Ceiling division: the last block may be only partially used.
+    const int blocks = (nthreads + kThreadsPerBlock - 1) / kThreadsPerBlock;
+    dim3 threads(kThreadsPerBlock, 1);
+    dim3 grid(blocks, 1);
+
+    // NOTE(review): assumes `context` really is a CUDADeviceContext.
+    KernelMaxOut<
+        MaxOutProcess,
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(nthreads, input_data, input_channels,
+                              input_height, input_width, groups,
+                              output_data, maxout_process);
+  }
+};
+/*
+ * GPU maxout backward pass. All tensors are in NCHW format.
+ * Launches KernelMaxoutGrad on the stream of `context`; that kernel only adds
+ * into input_grad. NOTE(review): caller presumably zero-fills it; confirm.
+ */
+template <typename T>
+class MaxOutGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, int groups) {
+    const int kThreadsPerBlock = 1024;
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    const int nthreads = output.numel();
+    // Ceiling division: the last block may be only partially used.
+    const int blocks = (nthreads + kThreadsPerBlock - 1) / kThreadsPerBlock;
+    dim3 threads(kThreadsPerBlock, 1);
+    dim3 grid(blocks, 1);
+
+    // NOTE(review): assumes `context` really is a CUDADeviceContext.
+    KernelMaxoutGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_height, input_width, groups);
+  }
+};
+
+// Explicit instantiations of the GPU specialisations defined above, for the
+// float and double element types.
+template class MaxOutFunctor<platform::GPUPlace, math::MaxOut<float>, float>;
+template class MaxOutFunctor<platform::GPUPlace, math::MaxOut<double>, double>;
+
+template class MaxOutGradFunctor<platform::GPUPlace, float>;
+template class MaxOutGradFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+
+// Macros ignore namespaces: keep any FLT_MAX already provided by <cfloat>.
+#ifndef FLT_MAX
+#define FLT_MAX __FLT_MAX__
+#endif
+
+// Reduction policy for maxout: initial() gives the seed value and
+// compute(y, x) folds x into the running maximum y.
+// NOTE(review): the seed is -FLT_MAX even when T is double.
+template <class T>
+class MaxOut {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
+  DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
+};
+
+// Gradient rule matching MaxOut: dx += dy when x == y, otherwise dx += 0.
+// `scale` is accepted but not used.
+template <class T>
+class MaxOutGrad {
+ public:
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) { dx += dy * static_cast<T>(x == y); }
+};
+
+
+// Maxout forward functor; specialised per Place in maxouting.cc/.cu.
+template <typename Place, typename MaxOutProcess, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups, MaxOutProcess maxout_process);
+};
+
+
+// Maxout backward functor; specialised per Place in maxouting.cc/.cu.
+template <typename Place, typename T>
+class MaxOutGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, int groups);
+};
+
+
+
+
+
+
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle