From 48e0f432537a97e915306601a8b5c8b72a77b6d1 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Mon, 12 Jun 2017 21:22:15 +0800
Subject: [PATCH 01/20] Add ImageExpandFunction.

---
 paddle/function/GemmConvOp.h      |  84 +++++++++++++++
 paddle/function/ImageExpandOp.cpp | 164 ++++++++++++++++++++++++++++++
 2 files changed, 248 insertions(+)
 create mode 100644 paddle/function/GemmConvOp.h
 create mode 100644 paddle/function/ImageExpandOp.cpp
diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h
new file mode 100644
index 0000000000000..25d2e220bfb93
--- /dev/null
+++ b/paddle/function/GemmConvOp.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// #include "ConvOp.h"
+
+namespace paddle {
+
+/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
+enum ColFormat { kCFO = 0, kOCF = 1 };
+
+/*
+ * \brief Converts the image data of one sample (three dimensions, CHW) into
+ *        a colData. Then you can reshape colData to a convolution matrix for
+ *        convolution calculation based on matrix multiplication.
+ *
+ * \param imData  Image data of a single sample in CHW format.
+ *                The format of imData is:
+ *                [input_channels, input_height, input_width].
+ * \param colData Column (expanded) data written by the functor.
+ *                If the template argument Format is kCFO,
+ *                the format of colData is:
+ *                [input_channels,
+ *                 filter_height,
+ *                 filter_width,
+ *                 output_height,
+ *                 output_width]
+ *                If the template argument Format is kOCF,
+ *                the format of colData is:
+ *                [output_height,
+ *                 output_width,
+ *                 input_channels,
+ *                 filter_height,
+ *                 filter_width]
+ */
+template <ColFormat Format, DeviceType Device, class T>
+class Im2ColFunctor {
+public:
+  void operator()(const T* imData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* colData);
+};
+
+template <ColFormat Format, DeviceType Device, class T>
+class Col2ImFunctor {
+public:
+  void operator()(const T* colData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* imData);
+};
+
+}  // namespace paddle
diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
new file mode 100644
index 0000000000000..426b6c8e312e4
--- /dev/null
+++ b/paddle/function/ImageExpandOp.cpp
@@ -0,0 +1,164 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include "GemmConvOp.h"
+
+namespace paddle {
+
+/*
+ * imData = [input_channels, input_height, input_width]
+ * colData = [output_height, output_width,
+ *            input_channels, filter_height, filter_width]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* colData) {
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
+                  imColOffset < 0 || imColOffset >= inputWidth) {
+                colData[colDataOffset] = T(0);
+              } else {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                colData[colDataOffset] = imData[imDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * \brief Converts the image data of four dimensions(NCHW) into
+ *        a sequence data of three dimensions(NST). Where N is batch size,
+ *        S is the length of the sequence after each image is expanded,
+ *        T is the size of each time step in the sequence.
+ *
+ * \param inputs[0]  Image data of NCHW format.
+ * \param outputs[0] Sequence data of NST format.
+ */
+template <DeviceType Device>
+class ImageExpandFunction : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    // function arguments
+    strides_ = config.get<std::vector<size_t>>("strides");
+    paddings_ = config.get<std::vector<size_t>>("paddings");
+    blocks_ = config.get<std::vector<size_t>>("blocks");
+
+    // number of inputs and outputs
+    numInputs_ = 1;
+    numOutputs_ = 1;
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& output = outputs[0].shape();
+    // input argument should be 4-dimensional.
+    CHECK_EQ(input.ndims(), (size_t)4);
+    // output argument should be 3-dimensional.
+    CHECK_EQ(output.ndims(), (size_t)3);
+    // The batchSize of the input needs to be equal to
+    // the batchSize of the output.
+    CHECK_EQ(input[0], output[0]);
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t seqLength = output[1];
+    size_t stepSize = output[2];
+    size_t outputHeight =
+        1 +
+        (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH();
+    size_t outputWidth =
+        1 +
+        (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW();
+    CHECK_EQ(seqLength, outputHeight * outputWidth);
+    CHECK_EQ(stepSize, inputChannels * blockH() * blockH());
+
+    real* inputData = inputs[0].data<real>();
+    real* outputData = outputs[0].data<real>();
+    Im2ColFunctor<kOCF, Device, real> im2col;
+    for (size_t i = 0; i < batchSize; i++) {
+      im2col(inputData,
+             inputChannels,
+             inputHeight,
+             inputWidth,
+             blockH(),
+             blockW(),
+             strideH(),
+             strideW(),
+             paddingH(),
+             paddingW(),
+             outputHeight,
+             outputWidth,
+             outputData);
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputData += seqLength * stepSize;
+    }
+  }
+
+protected:
+  std::vector<size_t> strides_;
+  std::vector<size_t> paddings_;
+  std::vector<size_t> blocks_;
+
+  inline int strideH() const { return strides_[0]; }
+
+  inline int strideW() const { return strides_[1]; }
+
+  inline int paddingH() const { return paddings_[0]; }
+
+  inline int paddingW() const { return paddings_[1]; }
+
+  inline int blockH() const { return blocks_[0]; }
+
+  inline int blockW() const { return blocks_[1]; }
+};
+
+}  // namespace paddle

From 61aa1098fd13339c5be752cd1dc8f0119296c966 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 13 Jun 2017 10:51:52 +0800
Subject: [PATCH 02/20] BlockExpandLayer based on the ImageExpand Function.

---
 paddle/function/ImageExpandOp.cpp          |  9 ++-
 paddle/gserver/layers/BlockExpandLayer.cpp | 80 ++++++++++++++--------
 2 files changed, 60 insertions(+), 29 deletions(-)

diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index 426b6c8e312e4..0c10f30bbd9c1 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -119,12 +119,17 @@ class ImageExpandFunction : public FunctionBase {
         1 +
         (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW();
     CHECK_EQ(seqLength, outputHeight * outputWidth);
-    CHECK_EQ(stepSize, inputChannels * blockH() * blockH());
+    CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
 
     real* inputData = inputs[0].data<real>();
     real* outputData = outputs[0].data<real>();
     Im2ColFunctor<kOCF, Device, real> im2col;
     for (size_t i = 0; i < batchSize; i++) {
+      // The result of im2col is [output_height, output_width,
+      // input_channels, filter_height, filter_width], and it is easy to
+      // reshape into [seqLength, stepSize], where seqLength is equal
+      // output_height * output_width, stepSize is equal
+      // input_channels * filter_height * filter_width
       im2col(inputData,
              inputChannels,
              inputHeight,
@@ -161,4 +166,6 @@ class ImageExpandFunction : public FunctionBase {
   inline int blockW() const { return blocks_[1]; }
 };
 
+REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandFunction);
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index 2bafeb92158c5..9760d39bb4a3c 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -37,6 +37,18 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
   imgSizeH_ = blockConf.img_size_y();
   imgSizeW_ = blockConf.img_size_x();
 
+  if (!useGpu_) {
+    std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
+    std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
+    std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
+    createFunction(forward_,
+                   "ImageExpand",
+                   FuncConfig()
+                       .set("strides", strides)
+                       .set("paddings", paddings)
+                       .set("blocks", blocks));
+  }
+
   return true;
 }
 
@@ -63,10 +75,11 @@ void BlockExpandLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-
   size_t blockNum = getBlockNum();
   size_t blockSize = blockH_ * blockW_ * channels_;
   resetOutput(blockNum * batchSize, blockSize);
+  // TODO(hedaoyuan): After completing the GPU version of ImageExpand,
+  // refactor the following code.
   Argument& out = getOutput();
   MatrixPtr outV = getOutputValue();
 
@@ -78,38 +91,49 @@ void BlockExpandLayer::forward(PassType passType) {
   int* start = out.sequenceStartPositions->getMutableData(false);
   int* dims = out.cpuSequenceDims->getData();
   for (size_t i = 0; i < batchSize; i++) {
-    outVTrans_->zeroMem();
-    /* expand each block as one row */
-    MatrixPtr inputTmp =
-        Matrix::create(input->getData() + i * input->getWidth(),
-                       1,
-                       input->getWidth(),
-                       false,
-                       useGpu_);
-    outVTrans_->convExpand(*inputTmp,
-                           imgSizeH_,
-                           imgSizeW_,
-                           channels_,
-                           blockH_,
-                           blockW_,
-                           strideH_,
-                           strideW_,
-                           paddingH_,
-                           paddingW_,
-                           outputH_,
-                           outputW_);
-    MatrixPtr outVTmp =
-        Matrix::create(outV->getData() + i * blockNum * blockSize,
-                       blockNum,
-                       blockSize,
-                       false,
-                       useGpu_);
-    outVTrans_->transpose(outVTmp, false);
+    if (useGpu_) {
+      outVTrans_->zeroMem();
+      /* expand each block as one row */
+      MatrixPtr inputTmp =
+          Matrix::create(input->getData() + i * input->getWidth(),
+                         1,
+                         input->getWidth(),
+                         false,
+                         useGpu_);
+      outVTrans_->convExpand(*inputTmp,
+                             imgSizeH_,
+                             imgSizeW_,
+                             channels_,
+                             blockH_,
+                             blockW_,
+                             strideH_,
+                             strideW_,
+                             paddingH_,
+                             paddingW_,
+                             outputH_,
+                             outputW_);
+      MatrixPtr outVTmp =
+          Matrix::create(outV->getData() + i * blockNum * blockSize,
+                         blockNum,
+                         blockSize,
+                         false,
+                         useGpu_);
+      outVTrans_->transpose(outVTmp, false);
+    }
     start[i] = i * blockNum;
     dims[2 * i] = outputH_;
     dims[2 * i + 1] = outputW_;
   }
   start[batchSize] = batchSize * blockNum;
+  if (!useGpu_) {
+    TensorShape inputShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+    TensorShape outputShape({batchSize, blockNum, blockSize});
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getInputValue(0), inputShape);
+    outputs.addArg(*getOutputValue(), outputShape, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
+  }
 }
 
 void BlockExpandLayer::backward(const UpdateCallback& callback) {

From 2acb84fe70104980c902b252a26a526a3d943c2a Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 13 Jun 2017 14:18:20 +0800
Subject: [PATCH 03/20] Add ImageExpandGrad Function.

---
 paddle/function/GemmConvOp.h               |   1 +
 paddle/function/ImageExpandOp.cpp          | 224 +++++++++++++++++----
 paddle/gserver/layers/BlockExpandLayer.cpp |  89 ++++----
 paddle/gserver/layers/BlockExpandLayer.h   |   3 +
 4 files changed, 237 insertions(+), 80 deletions(-)

diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h
index 25d2e220bfb93..f724643f35af9 100644
--- a/paddle/function/GemmConvOp.h
+++ b/paddle/function/GemmConvOp.h
@@ -44,6 +44,7 @@ enum ColFormat { kCFO = 0, kOCF = 1 };
  *                 input_channels,
  *                 filter_height,
  *                 filter_width]
+ * TODO(hedaoyuan): Refactor the arguments of the interface with TensorShape.
  */
 template <ColFormat Format, DeviceType Device, class T>
 class Im2ColFunctor {
diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index 0c10f30bbd9c1..4d8c25ffcdafa 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -70,16 +70,67 @@ class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
   }
 };
 
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* colData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* imData) {
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
+                  imColOffset >= 0 && imColOffset < inputWidth) {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                imData[imDataOffset] += colData[colDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
 /*
  * \brief Converts the image data of four dimensions(NCHW) into
- *        a sequence data of three dimensions(NST). Where N is batch size,
- *        S is the length of the sequence after each image is expanded,
- *        T is the size of each time step in the sequence.
+ *        a sequence data of three dimensions(NST) in the forward calculation,
+ *        which is reversed in the backward calculation.
+ *        Where N is batch size, S is the length of the sequence after each
+ *        image is expanded, T is the size of each time step in the sequence.
  *
+ * Arguments in forward function:
  * \param inputs[0]  Image data of NCHW format.
  * \param outputs[0] Sequence data of NST format.
+ *
+ * Arguments in backward function:
+ * \param inputs[0]  Sequence data of NST format.
+ * \param outputs[0] Image data of NCHW format.
  */
-template <DeviceType Device>
 class ImageExpandFunction : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
@@ -93,25 +144,27 @@ class ImageExpandFunction : public FunctionBase {
     numOutputs_ = 1;
   }
 
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& output = outputs[0].shape();
-    // input argument should be 4-dimensional.
-    CHECK_EQ(input.ndims(), (size_t)4);
-    // output argument should be 3-dimensional.
-    CHECK_EQ(output.ndims(), (size_t)3);
-    // The batchSize of the input needs to be equal to
-    // the batchSize of the output.
-    CHECK_EQ(input[0], output[0]);
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t seqLength = output[1];
-    size_t stepSize = output[2];
+  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
+
+  void check(const TensorShape& image, const TensorShape& sequence) {
+    // image shape should be 4-dimensional.
+    CHECK_EQ(image.ndims(), (size_t)4);
+    // sequence shape should be 3-dimensional.
+    CHECK_EQ(sequence.ndims(), (size_t)3);
+    // The batchSize of the image needs to be equal to
+    // the batchSize of the sequence.
+    CHECK_EQ(image[0], sequence[0]);
+  }
+
+  // Calculate the shape of colData based on the shape of the image
+  // and the shape of the sequence.
+  TensorShape getColShape(const TensorShape& image,
+                          const TensorShape& sequence) {
+    size_t inputChannels = image[1];
+    size_t inputHeight = image[2];
+    size_t inputWidth = image[3];
+    size_t seqLength = sequence[1];
+    size_t stepSize = sequence[2];
     size_t outputHeight =
         1 +
         (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH();
@@ -121,8 +174,59 @@ class ImageExpandFunction : public FunctionBase {
     CHECK_EQ(seqLength, outputHeight * outputWidth);
     CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
 
-    real* inputData = inputs[0].data<real>();
-    real* outputData = outputs[0].data<real>();
+    // [output_height, output_width,
+    // input_channels, filter_height, filter_width]
+    return TensorShape({outputHeight,
+                        outputWidth,
+                        inputChannels,
+                        (size_t)blockH(),
+                        (size_t)blockW()});
+  }
+
+protected:
+  std::vector<size_t> strides_;
+  std::vector<size_t> paddings_;
+  std::vector<size_t> blocks_;
+
+  inline int strideH() const { return strides_[0]; }
+
+  inline int strideW() const { return strides_[1]; }
+
+  inline int paddingH() const { return paddings_[0]; }
+
+  inline int paddingW() const { return paddings_[1]; }
+
+  inline int blockH() const { return blocks_[0]; }
+
+  inline int blockW() const { return blocks_[1]; }
+};
+
+template <DeviceType Device>
+class ImageExpandForward : public ImageExpandFunction {
+public:
+  void init(const FuncConfig& config) override {
+    ImageExpandFunction::init(config);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    const TensorShape& image = inputs[0].shape();
+    const TensorShape& sequence = outputs[0].shape();
+    check(image, sequence);
+
+    TensorShape colShape = getColShape(image, sequence);
+    size_t batchSize = image[0];
+    size_t inputChannels = image[1];
+    size_t inputHeight = image[2];
+    size_t inputWidth = image[3];
+    size_t seqLength = sequence[1];
+    size_t stepSize = sequence[2];
+    size_t outputHeight = colShape[0];
+    size_t outputWidth = colShape[1];
+
+    real* imageData = inputs[0].data<real>();
+    real* seqData = outputs[0].data<real>();
     Im2ColFunctor<kOCF, Device, real> im2col;
     for (size_t i = 0; i < batchSize; i++) {
       // The result of im2col is [output_height, output_width,
@@ -130,7 +234,7 @@ class ImageExpandFunction : public FunctionBase {
       // reshape into [seqLength, stepSize], where seqLength is equal
       // output_height * output_width, stepSize is equal
       // input_channels * filter_height * filter_width
-      im2col(inputData,
+      im2col(imageData,
              inputChannels,
              inputHeight,
              inputWidth,
@@ -142,30 +246,64 @@ class ImageExpandFunction : public FunctionBase {
              paddingW(),
              outputHeight,
              outputWidth,
-             outputData);
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputData += seqLength * stepSize;
+             seqData);
+      imageData += inputChannels * inputHeight * inputWidth;
+      seqData += seqLength * stepSize;
     }
   }
+};
 
-protected:
-  std::vector<size_t> strides_;
-  std::vector<size_t> paddings_;
-  std::vector<size_t> blocks_;
-
-  inline int strideH() const { return strides_[0]; }
-
-  inline int strideW() const { return strides_[1]; }
-
-  inline int paddingH() const { return paddings_[0]; }
+template <DeviceType Device>
+class ImageExpandBackward : public ImageExpandFunction {
+public:
+  void init(const FuncConfig& config) override {
+    ImageExpandFunction::init(config);
+  }
 
-  inline int paddingW() const { return paddings_[1]; }
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    // Since the implementation of Col2ImFunctor is ADD_TO,
+    // this function only supports ADD_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& image = outputs[0].shape();
+    const TensorShape& sequence = inputs[0].shape();
+    check(image, sequence);
 
-  inline int blockH() const { return blocks_[0]; }
+    TensorShape colShape = getColShape(image, sequence);
+    size_t batchSize = image[0];
+    size_t inputChannels = image[1];
+    size_t inputHeight = image[2];
+    size_t inputWidth = image[3];
+    size_t seqLength = sequence[1];
+    size_t stepSize = sequence[2];
+    size_t outputHeight = colShape[0];
+    size_t outputWidth = colShape[1];
 
-  inline int blockW() const { return blocks_[1]; }
+    real* imageData = outputs[0].data<real>();
+    real* seqData = inputs[0].data<real>();
+    Col2ImFunctor<kOCF, Device, real> col2im;
+    for (size_t i = 0; i < batchSize; i++) {
+      col2im(seqData,
+             inputChannels,
+             inputHeight,
+             inputWidth,
+             blockH(),
+             blockW(),
+             strideH(),
+             strideW(),
+             paddingH(),
+             paddingW(),
+             outputHeight,
+             outputWidth,
+             imageData);
+      imageData += inputChannels * inputHeight * inputWidth;
+      seqData += seqLength * stepSize;
+    }
+  }
 };
 
-REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandFunction);
+REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward);
+REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward);
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index 9760d39bb4a3c..c8d0b21c8754d 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -47,6 +47,12 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
                        .set("strides", strides)
                        .set("paddings", paddings)
                        .set("blocks", blocks));
+    createFunction(backward_,
+                   "ImageExpandGrad",
+                   FuncConfig()
+                       .set("strides", strides)
+                       .set("paddings", paddings)
+                       .set("blocks", blocks));
   }
 
   return true;
@@ -126,12 +132,12 @@ void BlockExpandLayer::forward(PassType passType) {
   }
   start[batchSize] = batchSize * blockNum;
   if (!useGpu_) {
-    TensorShape inputShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-    TensorShape outputShape({batchSize, blockNum, blockSize});
+    inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+    outputShape_ = TensorShape({batchSize, blockNum, blockSize});
     BufferArgs inputs;
     BufferArgs outputs;
-    inputs.addArg(*getInputValue(0), inputShape);
-    outputs.addArg(*getOutputValue(), outputShape, ASSIGN_TO);
+    inputs.addArg(*getInputValue(0), inputShape_);
+    outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
     forward_[0]->calc(inputs, outputs);
   }
 }
@@ -144,41 +150,50 @@ void BlockExpandLayer::backward(const UpdateCallback& callback) {
   if (!preGrad) {
     return;
   }
-  MatrixPtr grad = getOutputGrad();
-  MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_);
-  size_t batchSize = preGrad->getHeight();
 
-  CHECK_EQ(batchSize * blockNum, grad->getHeight());
-  CHECK_EQ(blockSize, grad->getWidth());
+  if (useGpu_) {
+    MatrixPtr grad = getOutputGrad();
+    MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_);
+    size_t batchSize = preGrad->getHeight();
 
-  for (size_t i = 0; i < batchSize; i++) {
-    MatrixPtr gradTmp =
-        Matrix::create(grad->getData() + i * blockNum * blockSize,
-                       blockNum,
-                       blockSize,
-                       false,
-                       useGpu_);
-    gradTmp->transpose(gradTrans, false);
-    MatrixPtr preGradTmp =
-        Matrix::create(preGrad->getData() + i * preGrad->getWidth(),
-                       1,
-                       preGrad->getWidth(),
-                       false,
-                       useGpu_);
-    preGradTmp->convShrink(*gradTrans,
-                           imgSizeH_,
-                           imgSizeW_,
-                           channels_,
-                           blockH_,
-                           blockW_,
-                           strideH_,
-                           strideW_,
-                           paddingH_,
-                           paddingW_,
-                           outputH_,
-                           outputW_,
-                           1.0,
-                           1.0);
+    CHECK_EQ(batchSize * blockNum, grad->getHeight());
+    CHECK_EQ(blockSize, grad->getWidth());
+
+    for (size_t i = 0; i < batchSize; i++) {
+      MatrixPtr gradTmp =
+          Matrix::create(grad->getData() + i * blockNum * blockSize,
+                         blockNum,
+                         blockSize,
+                         false,
+                         useGpu_);
+      gradTmp->transpose(gradTrans, false);
+      MatrixPtr preGradTmp =
+          Matrix::create(preGrad->getData() + i * preGrad->getWidth(),
+                         1,
+                         preGrad->getWidth(),
+                         false,
+                         useGpu_);
+      preGradTmp->convShrink(*gradTrans,
+                             imgSizeH_,
+                             imgSizeW_,
+                             channels_,
+                             blockH_,
+                             blockW_,
+                             strideH_,
+                             strideW_,
+                             paddingH_,
+                             paddingW_,
+                             outputH_,
+                             outputW_,
+                             1.0,
+                             1.0);
+    }
+  } else {
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getOutputGrad(), outputShape_);
+    outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
+    backward_[0]->calc(inputs, outputs);
   }
 }
 
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index 8f347400e60ec..edda0e0b630ae 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -53,6 +53,9 @@ class BlockExpandLayer : public Layer {
   /// auxiliary variable, which saves the transposed output value.
   MatrixPtr outVTrans_;
 
+  TensorShape inputShape_;
+  TensorShape outputShape_;
+
 public:
   explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
 

From 0672d330a3d4f55c54ce8568c974a10c02ba40cf Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 13 Jun 2017 15:42:17 +0800
Subject: [PATCH 04/20] Use the TensorShape to reconstruct the arguments of the
 Im2ColFunctor and Col2ImFunctor interfaces.

---
 paddle/function/Im2Col.h          |  92 +++++++++++++++++++++++
 paddle/function/ImageExpandOp.cpp | 120 +++++++++++++-----------------
 2 files changed, 145 insertions(+), 67 deletions(-)
 create mode 100644 paddle/function/Im2Col.h

diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
new file mode 100644
index 0000000000000..d461ec7510b48
--- /dev/null
+++ b/paddle/function/Im2Col.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+
+/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
+enum ColFormat { kCFO = 0, kOCF = 1 };
+
+/*
+ * \brief Converts the image data of three dimensions(CHW) into a colData of
+ *        five dimensions in the Im2ColFunctor calculation,
+ *        And in the Col2ImFunctor calculation, it is reversed.
+ *
+ * \param imData  Image data of NCHW format.
+ *                The shape of imData is:
+ *                [inputChannels, inputHeight, inputWidth].
+ * \param colData colData data.
+ *
+ * If the template argument Format is kCFO, the shape of colData is:
+ * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ * So, it is easy to reshape into a convolution matrix for convolution
+ * calculation based on matrix multiplication.
+ * The shape of convolution matrix is [height, width], where the height equals
+ * inputChannels * filterHeight * filterWidth, and the width equals
+ * outputHeight * outputWidth.
+ *
+ * Reshape:
+ *     shape of colData                shape of sequence
+ *     [inputChannels,
+ *      filterHeight,
+ *      filterWidth,      ======>    [seqLength, stepSize]
+ *      outputHeight,
+ *      outputWidth]
+ *
+ * If the template argument Format is kOCF, the shape of colData is:
+ * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ * So, it is easy to reshape into a sequence matrix for rnn calculation.
+ * The shape of sequence matrix is [seqLength, stepSize], where the seqLength
+ * equals outputHeight * outputWidth, and the stepSize equals
+ * inputChannels * filterHeight * filterWidth.
+ *
+ * Reshape:
+ *     shape of colData                shape of sequence
+ *     [outputHeight,
+ *      outputWidth,
+ *      inputChannels,    ======>    [seqLength, stepSize]
+ *      filterHeight,
+ *      filterWidth]
+ *
+ * \note The caller needs to ensure that imShape.inputChannels is equal to
+ *       colShape.inputChannels.
+ */
+template <ColFormat Format, DeviceType Device, class T>
+class Im2ColFunctor {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth);
+};
+
+template <ColFormat Format, DeviceType Device, class T>
+class Col2ImFunctor {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth);
+};
+
+}  // namespace paddle
diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index 4d8c25ffcdafa..ad34967bd6580 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -13,31 +13,33 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Function.h"
-#include "GemmConvOp.h"
+#include "Im2Col.h"
 
 namespace paddle {
 
 /*
- * imData = [input_channels, input_height, input_width]
- * colData = [output_height, output_width,
- *            input_channels, filter_height, filter_width]
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
  */
 template <class T>
 class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
 public:
   void operator()(const T* imData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* colData) {
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
     for (int outputH = 0; outputH < outputHeight; ++outputH) {
       for (int outputW = 0; outputW < outputWidth; ++outputW) {
         for (int channel = 0; channel < inputChannels; ++channel) {
@@ -55,7 +57,7 @@ class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
                   filterW;
               if (imRowOffset < 0 || imRowOffset >= inputHeight ||
                   imColOffset < 0 || imColOffset >= inputWidth) {
-                colData[colDataOffset] = T(0);
+                colData[colDataOffset] = float(0);
               } else {
                 int imDataOffset =
                     (channel * inputHeight + imRowOffset) * inputWidth +
@@ -70,22 +72,29 @@ class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
   }
 };
 
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
 template <class T>
 class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
 public:
-  void operator()(const T* colData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* imData) {
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
     for (int outputH = 0; outputH < outputHeight; ++outputH) {
       for (int outputW = 0; outputW < outputWidth; ++outputW) {
         for (int channel = 0; channel < inputChannels; ++channel) {
@@ -146,7 +155,7 @@ class ImageExpandFunction : public FunctionBase {
 
   virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
 
-  void check(const TensorShape& image, const TensorShape& sequence) {
+  void check(const TensorShape& image, const TensorShape& sequence) const {
     // image shape should be 4-dimensional.
     CHECK_EQ(image.ndims(), (size_t)4);
     // sequence shape should be 3-dimensional.
@@ -159,7 +168,7 @@ class ImageExpandFunction : public FunctionBase {
   // Calculate the shape of colData based on the shape of the image
   // and the shape of the sequence.
   TensorShape getColShape(const TensorShape& image,
-                          const TensorShape& sequence) {
+                          const TensorShape& sequence) const {
     size_t inputChannels = image[1];
     size_t inputHeight = image[2];
     size_t inputWidth = image[3];
@@ -174,8 +183,7 @@ class ImageExpandFunction : public FunctionBase {
     CHECK_EQ(seqLength, outputHeight * outputWidth);
     CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
 
-    // [output_height, output_width,
-    // input_channels, filter_height, filter_width]
+    // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
     return TensorShape({outputHeight,
                         outputWidth,
                         inputChannels,
@@ -215,40 +223,29 @@ class ImageExpandForward : public ImageExpandFunction {
     const TensorShape& sequence = outputs[0].shape();
     check(image, sequence);
 
+    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
     TensorShape colShape = getColShape(image, sequence);
     size_t batchSize = image[0];
-    size_t inputChannels = image[1];
-    size_t inputHeight = image[2];
-    size_t inputWidth = image[3];
-    size_t seqLength = sequence[1];
-    size_t stepSize = sequence[2];
-    size_t outputHeight = colShape[0];
-    size_t outputWidth = colShape[1];
 
     real* imageData = inputs[0].data<real>();
     real* seqData = outputs[0].data<real>();
     Im2ColFunctor<kOCF, Device, real> im2col;
     for (size_t i = 0; i < batchSize; i++) {
-      // The result of im2col is [output_height, output_width,
-      // input_channels, filter_height, filter_width], and it is easy to
+      // The result of im2col is [outputHeight, outputWidth,
+      // inputChannels, filterHeight, filterWidth], and it is easy to
       // reshape into [seqLength, stepSize], where seqLength is equal
       // output_height * output_width, stepSize is equal
       // input_channels * filter_height * filter_width
       im2col(imageData,
-             inputChannels,
-             inputHeight,
-             inputWidth,
-             blockH(),
-             blockW(),
+             imShape,
+             seqData,
+             colShape,
              strideH(),
              strideW(),
              paddingH(),
-             paddingW(),
-             outputHeight,
-             outputWidth,
-             seqData);
-      imageData += inputChannels * inputHeight * inputWidth;
-      seqData += seqLength * stepSize;
+             paddingW());
+      imageData += imShape.getElements();
+      seqData += colShape.getElements();
     }
   }
 };
@@ -270,35 +267,24 @@ class ImageExpandBackward : public ImageExpandFunction {
     const TensorShape& sequence = inputs[0].shape();
     check(image, sequence);
 
+    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
     TensorShape colShape = getColShape(image, sequence);
     size_t batchSize = image[0];
-    size_t inputChannels = image[1];
-    size_t inputHeight = image[2];
-    size_t inputWidth = image[3];
-    size_t seqLength = sequence[1];
-    size_t stepSize = sequence[2];
-    size_t outputHeight = colShape[0];
-    size_t outputWidth = colShape[1];
 
     real* imageData = outputs[0].data<real>();
     real* seqData = inputs[0].data<real>();
     Col2ImFunctor<kOCF, Device, real> col2im;
     for (size_t i = 0; i < batchSize; i++) {
-      col2im(seqData,
-             inputChannels,
-             inputHeight,
-             inputWidth,
-             blockH(),
-             blockW(),
+      col2im(imageData,
+             imShape,
+             seqData,
+             colShape,
              strideH(),
              strideW(),
              paddingH(),
-             paddingW(),
-             outputHeight,
-             outputWidth,
-             imageData);
-      imageData += inputChannels * inputHeight * inputWidth;
-      seqData += seqLength * stepSize;
+             paddingW());
+      imageData += imShape.getElements();
+      seqData += colShape.getElements();
     }
   }
 };

From 9c009b4087afa0ac61425cd9e45f8c2e60e92568 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 13 Jun 2017 15:43:48 +0800
Subject: [PATCH 05/20] Remove GemmConvOp.h file.

---
 paddle/function/GemmConvOp.h | 85 ------------------------------------
 1 file changed, 85 deletions(-)
 delete mode 100644 paddle/function/GemmConvOp.h

diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h
deleted file mode 100644
index f724643f35af9..0000000000000
--- a/paddle/function/GemmConvOp.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// #include "ConvOp.h"
-
-namespace paddle {
-
-/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
-enum ColFormat { kCFO = 0, kOCF = 1 };
-
-/*
- * \brief Converts the image data of four dimensions(NCHW) into a colData.
- *        Then you can reshape colData to a convolution matrix for
- *        convolution calculation based on matrix multiplication.
- *
- * \param imData  Image data of NCHW format.
- *                The format of imData is:
- *                [input_channels, input_height, input_width].
- * \param colData colData data.
- *                If the template argument Format is kCFO,
- *                the format of colData is:
- *                [input_channels,
- *                 filter_height,
- *                 filter_width,
- *                 output_height,
- *                 output_width]
- *                If the template argument Format is kOCF,
- *                the format of colData is:
- *                [output_height,
- *                 output_width,
- *                 input_channels,
- *                 filter_height,
- *                 filter_width]
- * TODO(hedaoyuan): Refactor the arguments of the interface with TensorShape.
- */
-template <ColFormat Format, DeviceType Device, class T>
-class Im2ColFunctor {
-public:
-  void operator()(const T* imData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* colData);
-};
-
-template <ColFormat Format, DeviceType Device, class T>
-class Col2ImFunctor {
-public:
-  void operator()(const T* colData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* imData);
-};
-
-}  // namespace paddle

From 34362d938175a012841275849f3b8102d736b4c6 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 13 Jun 2017 15:57:01 +0800
Subject: [PATCH 06/20] Fix some of the wrong comments in the Im2Col.h file.

---
 paddle/function/Im2Col.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index d461ec7510b48..6d76e229bfc21 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -24,10 +24,11 @@ enum ColFormat { kCFO = 0, kOCF = 1 };
  *        five dimensions in the Im2ColFunctor calculation,
  *        And in the Col2ImFunctor calculation, it is reversed.
  *
- * \param imData  Image data of NCHW format.
- *                The shape of imData is:
- *                [inputChannels, inputHeight, inputWidth].
- * \param colData colData data.
+ * \param imData   Image data.
+ * \param imShape  The shape of imData,
+ *                 [inputChannels, inputHeight, inputWidth].
+ * \param colData  Column data.
+ * \param colShape The shape of colData.
  *
  * If the template argument Format is kCFO, the shape of colData is:
  * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
@@ -38,10 +39,10 @@ enum ColFormat { kCFO = 0, kOCF = 1 };
  * outputHeight * outputWidth.
  *
  * Reshape:
- *     shape of colData                shape of sequence
+ *     shape of colData           shape of convolution matrix
  *     [inputChannels,
  *      filterHeight,
- *      filterWidth,      ======>    [seqLength, stepSize]
+ *      filterWidth,      ======>      [height, width]
  *      outputHeight,
  *      outputWidth]
  *
@@ -53,7 +54,7 @@ enum ColFormat { kCFO = 0, kOCF = 1 };
  * inputChannels * filterHeight * filterWidth.
  *
  * Reshape:
- *     shape of colData                shape of sequence
+ *     shape of colData             shape of sequence matrix
  *     [outputHeight,
  *      outputWidth,
  *      inputChannels,    ======>    [seqLength, stepSize]

From 152bd2f9c867e8e165c3d22810281023880b3d16 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 13 Jun 2017 20:30:02 +0800
Subject: [PATCH 07/20] Add the GPU version implementation of ImageExpand
 function.

---
 paddle/function/Im2Col.h                   |   3 +
 paddle/function/Im2ColOpGpu.cu             | 130 +++++++++++++++++++++
 paddle/function/ImageExpandOp.cpp          |   3 +
 paddle/gserver/layers/BlockExpandLayer.cpp |  73 ++++--------
 paddle/gserver/layers/BlockExpandLayer.h   |   3 -
 5 files changed, 156 insertions(+), 56 deletions(-)
 create mode 100644 paddle/function/Im2ColOpGpu.cu

diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 6d76e229bfc21..48e2e32f9256f 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include "TensorShape.h"
+#include "TensorType.h"
+
 namespace paddle {
 
 /* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
new file mode 100644
index 0000000000000..1dac2585db721
--- /dev/null
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+
+namespace paddle {
+
+template<class T>
+__global__
+void im2colOCF(const T* imData, T* colData,
+               int inputChannels,
+               int inputHeight, int inputWidth,
+               int filterHeight, int filterWidth,
+               int strideHeight, int strideWidth,
+               int paddingHeight, int paddingWidth,
+               int outputHeight, int outputWidth) {
+  int idx = threadIdx.x;
+  int idy = threadIdx.y;
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+
+  for (int channelId = threadIdx.z;
+       channelId < inputChannels;
+       channelId += blockDim.z) {
+    int widthOffset = idx + swId * strideWidth - paddingWidth;
+    int heightOffset = idy + shId * strideHeight - paddingHeight;
+    int imOffset = widthOffset + heightOffset * inputWidth
+       + channelId * inputHeight * inputWidth;
+
+    int colOffset = idx + idy * filterWidth
+      + channelId * filterHeight * filterWidth
+      + (shId * outputWidth + swId)
+      * (inputChannels * filterHeight * filterWidth);
+
+    if (idx < filterWidth && idy < filterHeight) {
+      if (heightOffset >= inputHeight || heightOffset < 0 ||
+          widthOffset >= inputWidth || widthOffset < 0) {
+        colData[colOffset] = T(0);
+      } else {
+        colData[colOffset] = imData[imOffset];
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    im2colOCF<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, colData, inputChannels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+
+}  // namespace paddle
diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index ad34967bd6580..fe4c8fefcf5e8 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -291,5 +291,8 @@ class ImageExpandBackward : public ImageExpandFunction {
 
 REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward);
 REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(ImageExpand, GPU, ImageExpandForward);
+#endif
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index c8d0b21c8754d..1889b347c2d89 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -37,16 +37,16 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
   imgSizeH_ = blockConf.img_size_y();
   imgSizeW_ = blockConf.img_size_x();
 
+  std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
+  std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
+  std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
+  createFunction(forward_,
+                 "ImageExpand",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
   if (!useGpu_) {
-    std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
-    std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
-    std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
-    createFunction(forward_,
-                   "ImageExpand",
-                   FuncConfig()
-                       .set("strides", strides)
-                       .set("paddings", paddings)
-                       .set("blocks", blocks));
     createFunction(backward_,
                    "ImageExpandGrad",
                    FuncConfig()
@@ -84,62 +84,29 @@ void BlockExpandLayer::forward(PassType passType) {
   size_t blockNum = getBlockNum();
   size_t blockSize = blockH_ * blockW_ * channels_;
   resetOutput(blockNum * batchSize, blockSize);
-  // TODO(hedaoyuan): After completing the GPU version of ImageExpand,
-  // refactor the following code.
-  Argument& out = getOutput();
-  MatrixPtr outV = getOutputValue();
 
-  MatrixPtr input = getPrev(0)->getOutputValue();
-  Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_);
+  // calculate output_.value
+  inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+  outputShape_ = TensorShape({batchSize, blockNum, blockSize});
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inputShape_);
+  outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+
+  // calculate output_.sequenceStartPositions and output_.cpuSequenceDims
+  Argument& out = getOutput();
   ICpuGpuVector::resizeOrCreate(
       out.sequenceStartPositions, batchSize + 1, false);
   IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
   int* start = out.sequenceStartPositions->getMutableData(false);
   int* dims = out.cpuSequenceDims->getData();
   for (size_t i = 0; i < batchSize; i++) {
-    if (useGpu_) {
-      outVTrans_->zeroMem();
-      /* expand each block as one row */
-      MatrixPtr inputTmp =
-          Matrix::create(input->getData() + i * input->getWidth(),
-                         1,
-                         input->getWidth(),
-                         false,
-                         useGpu_);
-      outVTrans_->convExpand(*inputTmp,
-                             imgSizeH_,
-                             imgSizeW_,
-                             channels_,
-                             blockH_,
-                             blockW_,
-                             strideH_,
-                             strideW_,
-                             paddingH_,
-                             paddingW_,
-                             outputH_,
-                             outputW_);
-      MatrixPtr outVTmp =
-          Matrix::create(outV->getData() + i * blockNum * blockSize,
-                         blockNum,
-                         blockSize,
-                         false,
-                         useGpu_);
-      outVTrans_->transpose(outVTmp, false);
-    }
     start[i] = i * blockNum;
     dims[2 * i] = outputH_;
     dims[2 * i + 1] = outputW_;
   }
   start[batchSize] = batchSize * blockNum;
-  if (!useGpu_) {
-    inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-    outputShape_ = TensorShape({batchSize, blockNum, blockSize});
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*getInputValue(0), inputShape_);
-    outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
-    forward_[0]->calc(inputs, outputs);
-  }
 }
 
 void BlockExpandLayer::backward(const UpdateCallback& callback) {
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index edda0e0b630ae..15ce73ab8b2ca 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -50,9 +50,6 @@ class BlockExpandLayer : public Layer {
   size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
   size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
 
-  /// auxiliary variable, which saves the transposed output value.
-  MatrixPtr outVTrans_;
-
   TensorShape inputShape_;
   TensorShape outputShape_;
 

From f8ef8c174c442f14662a94e59fcda6587498c8a5 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 13 Jun 2017 21:07:20 +0800
Subject: [PATCH 08/20] Add the GPU version implementation of ImageExpandGrad
 function.

---
 paddle/function/Im2ColOpGpu.cu             | 107 +++++++++++++++++----
 paddle/function/ImageExpandOp.cpp          |   1 +
 paddle/gserver/layers/BlockExpandLayer.cpp |  33 +++----
 3 files changed, 103 insertions(+), 38 deletions(-)

diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
index 1dac2585db721..bddd8ffc7c0b4 100644
--- a/paddle/function/Im2ColOpGpu.cu
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Im2Col.h"
+#include "hl_device_functions.cuh"
 
 namespace paddle {
 
@@ -25,30 +26,29 @@ void im2colOCF(const T* imData, T* colData,
                int strideHeight, int strideWidth,
                int paddingHeight, int paddingWidth,
                int outputHeight, int outputWidth) {
-  int idx = threadIdx.x;
-  int idy = threadIdx.y;
   int swId = blockIdx.x;
   int shId = blockIdx.y;
-
   for (int channelId = threadIdx.z;
        channelId < inputChannels;
        channelId += blockDim.z) {
-    int widthOffset = idx + swId * strideWidth - paddingWidth;
-    int heightOffset = idy + shId * strideHeight - paddingHeight;
-    int imOffset = widthOffset + heightOffset * inputWidth
-       + channelId * inputHeight * inputWidth;
-
-    int colOffset = idx + idy * filterWidth
-      + channelId * filterHeight * filterWidth
-      + (shId * outputWidth + swId)
-      * (inputChannels * filterHeight * filterWidth);
-
-    if (idx < filterWidth && idy < filterHeight) {
-      if (heightOffset >= inputHeight || heightOffset < 0 ||
-          widthOffset >= inputWidth || widthOffset < 0) {
-        colData[colOffset] = T(0);
-      } else {
-        colData[colOffset] = imData[imOffset];
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset = idx + swId * strideWidth - paddingWidth;
+        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth
+           + channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth
+          + channelId * filterHeight * filterWidth
+          + (shId * outputWidth + swId)
+          * (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= inputHeight || heightOffset < 0 ||
+            widthOffset >= inputWidth || widthOffset < 0) {
+          colData[colOffset] = T(0);
+        } else {
+          colData[colOffset] = imData[imOffset];
+        }
       }
     }
   }
@@ -105,6 +105,41 @@ public:
   }
 };
 
+template<class T>
+__global__
+void col2imOCF(T* imData, const T* colData,
+               int inputChannels,
+               int inputHeight, int inputWidth,
+               int filterHeight, int filterWidth,
+               int strideHeight, int strideWidth,
+               int paddingHeight, int paddingWidth,
+               int outputHeight, int outputWidth) {
+  // Each block handles one output position; threads stride over the window.
+  const int outCol = blockIdx.x;
+  const int outRow = blockIdx.y;
+  const int filterSize = filterHeight * filterWidth;
+  const int colBase =
+    (outRow * outputWidth + outCol) * (inputChannels * filterSize);
+  for (int ch = threadIdx.z; ch < inputChannels; ch += blockDim.z) {
+    for (int fy = threadIdx.y; fy < filterHeight; fy += blockDim.y) {
+      const int imRow = fy + outRow * strideHeight - paddingHeight;
+      if (imRow < 0 || imRow >= inputHeight) {
+        continue;  // whole filter row lies in the vertical padding
+      }
+      for (int fx = threadIdx.x; fx < filterWidth; fx += blockDim.x) {
+        const int imCol = fx + outCol * strideWidth - paddingWidth;
+        if (imCol < 0 || imCol >= inputWidth) {
+          continue;  // this tap lies in the horizontal padding
+        }
+        // Overlapping windows hit the same pixel, hence the atomic add.
+        const int imOffset = (ch * inputHeight + imRow) * inputWidth + imCol;
+        const int colOffset = colBase + ch * filterSize + fy * filterWidth + fx;
+        paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]);
+      }
+    }
+  }
+}
+
 /*
  * imShape = [inputChannels, inputHeight, inputWidth]
  * colShape =
@@ -121,10 +156,44 @@ public:
                   int strideWidth,
                   int paddingHeight,
                   int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    col2imOCF<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, colData, inputChannels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
   }
 };
 
 template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
 template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, double>;
 
 }  // namespace paddle
diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index fe4c8fefcf5e8..f227f6d0e10fa 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -293,6 +293,7 @@ REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward);
 REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward);
 #ifndef PADDLE_ONLY_CPU
 REGISTER_TYPED_FUNC(ImageExpand, GPU, ImageExpandForward);
+REGISTER_TYPED_FUNC(ImageExpandGrad, GPU, ImageExpandBackward);
 #endif
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index 1889b347c2d89..a5e644a4ae397 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -46,14 +46,12 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
                      .set("strides", strides)
                      .set("paddings", paddings)
                      .set("blocks", blocks));
-  if (!useGpu_) {
-    createFunction(backward_,
-                   "ImageExpandGrad",
-                   FuncConfig()
-                       .set("strides", strides)
-                       .set("paddings", paddings)
-                       .set("blocks", blocks));
-  }
+  createFunction(backward_,
+                 "ImageExpandGrad",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
 
   return true;
 }
@@ -110,14 +108,16 @@ void BlockExpandLayer::forward(PassType passType) {
 }
 
 void BlockExpandLayer::backward(const UpdateCallback& callback) {
-  size_t blockNum = outputH_ * outputW_;
-  size_t blockSize = blockH_ * blockW_ * channels_;
   /* Calculate the input layers error */
-  MatrixPtr preGrad = inputLayers_[0]->getOutputGrad();
-  if (!preGrad) {
-    return;
+  if (getInputGrad(0)) {
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getOutputGrad(), outputShape_);
+    outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
+    backward_[0]->calc(inputs, outputs);
   }
 
+#if 0
   if (useGpu_) {
     MatrixPtr grad = getOutputGrad();
     MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_);
@@ -155,13 +155,8 @@ void BlockExpandLayer::backward(const UpdateCallback& callback) {
                              1.0,
                              1.0);
     }
-  } else {
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*getOutputGrad(), outputShape_);
-    outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
-    backward_[0]->calc(inputs, outputs);
   }
+#endif
 }
 
 }  // namespace paddle

From bf6dfc1ff2a01cc35bf6a91177463cd40e328003 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 13 Jun 2017 21:30:32 +0800
Subject: [PATCH 09/20] Remove some of the code that has been commented out.

---
 paddle/gserver/layers/BlockExpandLayer.cpp | 41 ----------------------
 1 file changed, 41 deletions(-)

diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index a5e644a4ae397..adc9a814ffbb1 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -116,47 +116,6 @@ void BlockExpandLayer::backward(const UpdateCallback& callback) {
     outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
     backward_[0]->calc(inputs, outputs);
   }
-
-#if 0
-  if (useGpu_) {
-    MatrixPtr grad = getOutputGrad();
-    MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_);
-    size_t batchSize = preGrad->getHeight();
-
-    CHECK_EQ(batchSize * blockNum, grad->getHeight());
-    CHECK_EQ(blockSize, grad->getWidth());
-
-    for (size_t i = 0; i < batchSize; i++) {
-      MatrixPtr gradTmp =
-          Matrix::create(grad->getData() + i * blockNum * blockSize,
-                         blockNum,
-                         blockSize,
-                         false,
-                         useGpu_);
-      gradTmp->transpose(gradTrans, false);
-      MatrixPtr preGradTmp =
-          Matrix::create(preGrad->getData() + i * preGrad->getWidth(),
-                         1,
-                         preGrad->getWidth(),
-                         false,
-                         useGpu_);
-      preGradTmp->convShrink(*gradTrans,
-                             imgSizeH_,
-                             imgSizeW_,
-                             channels_,
-                             blockH_,
-                             blockW_,
-                             strideH_,
-                             strideW_,
-                             paddingH_,
-                             paddingW_,
-                             outputH_,
-                             outputW_,
-                             1.0,
-                             1.0);
-    }
-  }
-#endif
 }
 
 }  // namespace paddle

From d558b8bb82d6428b58f7ceb60ea87afcadce03ba Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 21 Jun 2017 10:36:36 +0800
Subject: [PATCH 10/20] Move the code in the GemmConvOpGpu.cu file into
 Im2ColOpGpu.cu.

---
 paddle/function/Im2ColOpGpu.cu | 172 +++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)

diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
index bddd8ffc7c0b4..361ecc4401a16 100644
--- a/paddle/function/Im2ColOpGpu.cu
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -17,6 +17,178 @@ limitations under the License. */
 
 namespace paddle {
 
+template<class T>
+__global__
+void im2col(const T* data_im, int numOuts, int height, int width,
+            int blockH, int blockW,
+            int strideH, int strideW,
+            int paddingH, int paddingW,
+            int height_col, int width_col,
+            T* data_col) {
+  int index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < numOuts) {  // one thread per (channel_in, h_out, w_out)
+    int w_out = index % width_col;
+    index /= width_col;
+    int h_out = index % height_col;
+    int channel_in = index / height_col;
+    int channel_out = channel_in * blockH * blockW;
+    int h_in = h_out * strideH;  // window origin in padded coordinates
+    int w_in = w_out * strideW;
+    // Point at this thread's entry for filter tap (0, 0).
+    data_col += (channel_out * height_col + h_out) * width_col + w_out;
+    for (int i = 0; i < blockH; ++i) {
+      for (int j = 0; j < blockW; ++j) {
+        int rIdx = int(h_in+i);
+        int cIdx = int(w_in+j);
+        if ((rIdx-(int)paddingH) >= (int)height ||
+            (rIdx-(int)paddingH) < 0 ||
+            (cIdx-(int)paddingW) >= (int)width ||
+            (cIdx-(int)paddingW) < 0) {
+          *data_col = 0;  // tap falls in the padding
+        } else {
+          rIdx = rIdx + channel_in*height - paddingH;
+          cIdx = cIdx - paddingW;
+          *data_col = data_im[rIdx* width + cIdx];
+        }
+        data_col += height_col * width_col;  // next filter tap
+      }
+    }
+  }
+}
+
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int numKernels = inputChannels * outputHeight * outputWidth;
+    int blocks = (numKernels + 1024 -1) / 1024;
+    int blockX = 512;
+    int blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+    im2col<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth,
+         strideHeight, strideWidth, paddingHeight, paddingWidth,
+         outputHeight, outputWidth, colData);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template<class T>
+__global__
+void col2im(size_t n, const T* data_col, size_t height,
+            size_t width, size_t channels,
+            size_t blockH, size_t blockW,
+            size_t strideH, size_t strideW,
+            size_t paddingH, size_t paddingW,
+            size_t height_col, size_t width_col,
+            T* data_im) {
+  size_t index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < n) {  // one thread per element of the padded image
+    T val = 0;  // accumulates every data_col entry that maps to this element
+    int w = int(index % width);
+    int h = int((index / width) % height);
+    int c = int(index / (width * height));
+    if ((w - (int)paddingW) >= 0 &&  // only elements outside the padding
+        (w - (int)paddingW) < (width-2 * paddingW) &&
+        (h - (int)paddingH) >= 0 &&
+        (h - paddingH) < (height - 2 * paddingH)) {
+      // range of column positions whose window covers (h, w)
+      int w_col_start =
+        (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
+      int w_col_end =
+        min((int)(w / (int)strideW + 1), (int)(width_col));
+      int h_col_start =
+        (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
+      int h_col_end = min(int(h / strideH + 1), int(height_col));
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          // data_col row: channel base plus the offset of (h, w) in the window
+          int c_col = int(c * blockH* blockW) + \
+            (h - h_col * (int)strideH) * (int)blockW +
+            (w - w_col * (int)strideW);
+          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+        }
+      }
+      h -= paddingH;  // back to unpadded image coordinates
+      w -= paddingW;
+      data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
+              h*(width-2*paddingW) + w] += val;
+    }
+  }
+}
+
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight)
+        * (inputWidth + 2*paddingWidth);
+
+    size_t blocks = (numKernels + 1024 -1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks+512-1)/512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    // To avoid involving atomic operations, we will launch one kernel per
+    // bottom dimension, and then in the kernel add up the top dimensions.
+    col2im<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+             (numKernels,
+              colData,
+              inputHeight + 2*paddingHeight,
+              inputWidth + 2*paddingWidth,
+              inputChannels,
+              filterHeight,
+              filterWidth,
+              strideHeight,
+              strideWidth,
+              paddingHeight,
+              paddingWidth,
+              outputHeight,
+              outputWidth,
+              imData);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+
 template<class T>
 __global__
 void im2colOCF(const T* imData, T* colData,

From eb0c7e5ebc9a8c267cf4dc399beeb6b93dcbe6c6 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 21 Jun 2017 11:03:30 +0800
Subject: [PATCH 11/20] Move the Im2Col code of the CPU version into the
 Im2ColOp.cpp file.

---
 paddle/function/Im2ColOp.cpp      | 235 ++++++++++++++++++++++++++++++
 paddle/function/Im2ColOpGpu.cu    |  26 +++-
 paddle/function/ImageExpandOp.cpp | 108 --------------
 3 files changed, 253 insertions(+), 116 deletions(-)
 create mode 100644 paddle/function/Im2ColOp.cpp

diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp
new file mode 100644
index 0000000000000..b7d1eb1eded7a
--- /dev/null
+++ b/paddle/function/Im2ColOp.cpp
@@ -0,0 +1,235 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+
+namespace paddle {
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) < 0 ||
+              (imRowIdx - paddingHeight) >= inputHeight ||
+              (imColIdx - paddingWidth) < 0 ||
+              (imColIdx - paddingWidth) >= inputWidth) {
+            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
+          } else {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            colData[(c * outputHeight + h) * outputWidth + w] =
+                imData[imRowIdx * inputWidth + imColIdx];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) >= 0 &&
+              (imRowIdx - paddingHeight) < inputHeight &&
+              (imColIdx - paddingWidth) >= 0 &&
+              (imColIdx - paddingWidth) < inputWidth) {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            imData[imRowIdx * inputWidth + imColIdx] +=
+                colData[(c * outputHeight + h) * outputWidth + w];
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+
+/*
+ * Expands imData (imShape = [inputChannels, inputHeight, inputWidth]) into
+ * colData; positions that fall in the padding are written as zero. colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
+                  imColOffset < 0 || imColOffset >= inputWidth) {
+                colData[colDataOffset] = T(0);  // tap lies in the padding
+              } else {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                colData[colDataOffset] = imData[imDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
+                  imColOffset >= 0 && imColOffset < inputWidth) {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                imData[imDataOffset] += colData[colDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+
+}  // namespace paddle
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
index 361ecc4401a16..15ba854009636 100644
--- a/paddle/function/Im2ColOpGpu.cu
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -57,6 +57,11 @@ void im2col(const T* data_im, int numOuts, int height, int width,
   }
 }
 
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
 template <class T>
 class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
 public:
@@ -71,10 +76,10 @@ public:
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
 
     int numKernels = inputChannels * outputHeight * outputWidth;
     int blocks = (numKernels + 1024 -1) / 1024;
@@ -135,6 +140,11 @@ void col2im(size_t n, const T* data_col, size_t height,
   }
 }
 
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
 template <class T>
 class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
 public:
@@ -149,10 +159,10 @@ public:
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
 
     size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight)
         * (inputWidth + 2*paddingWidth);
diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index f227f6d0e10fa..625bf5b6edf44 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -17,114 +17,6 @@ limitations under the License. */
 
 namespace paddle {
 
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-    for (int outputH = 0; outputH < outputHeight; ++outputH) {
-      for (int outputW = 0; outputW < outputWidth; ++outputW) {
-        for (int channel = 0; channel < inputChannels; ++channel) {
-          for (int filterH = 0; filterH < filterHeight; ++filterH) {
-            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset =
-                  outputH * strideHeight + filterH - paddingHeight;
-              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
-              int colDataOffset =
-                  (((outputH * outputWidth + outputW) * inputChannels +
-                    channel) *
-                       filterHeight +
-                   filterH) *
-                      filterWidth +
-                  filterW;
-              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
-                  imColOffset < 0 || imColOffset >= inputWidth) {
-                colData[colDataOffset] = float(0);
-              } else {
-                int imDataOffset =
-                    (channel * inputHeight + imRowOffset) * inputWidth +
-                    imColOffset;
-                colData[colDataOffset] = imData[imDataOffset];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-    for (int outputH = 0; outputH < outputHeight; ++outputH) {
-      for (int outputW = 0; outputW < outputWidth; ++outputW) {
-        for (int channel = 0; channel < inputChannels; ++channel) {
-          for (int filterH = 0; filterH < filterHeight; ++filterH) {
-            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset =
-                  outputH * strideHeight + filterH - paddingHeight;
-              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
-              int colDataOffset =
-                  (((outputH * outputWidth + outputW) * inputChannels +
-                    channel) *
-                       filterHeight +
-                   filterH) *
-                      filterWidth +
-                  filterW;
-              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
-                  imColOffset >= 0 && imColOffset < inputWidth) {
-                int imDataOffset =
-                    (channel * inputHeight + imRowOffset) * inputWidth +
-                    imColOffset;
-                imData[imDataOffset] += colData[colDataOffset];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
 /*
  * \brief Converts the image data of four dimensions(NCHW) into
  *        a sequence data of three dimensions(NST) in the forward calculation,

From 07cde439aae38137c42f662382e36d08c03d37fd Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 21 Jun 2017 11:18:58 +0800
Subject: [PATCH 12/20] Reconstruction of GemmConv Based on new im2col.

---
 paddle/function/GemmConvOp.cpp | 185 +++++++++------------------------
 1 file changed, 48 insertions(+), 137 deletions(-)

diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index a40e5d9d2e766..3f10bb9c83754 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -12,101 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "GemmConvOp.h"
+#include "ConvOp.h"
 #include "GemmFunctor.h"
+#include "Im2Col.h"
 #include "paddle/math/MemoryHandle.h"
 
 namespace paddle {
 
-/*
- * imData = [input_channels, input_height, input_width]
- * colData = [input_channels, filter_height, filter_width,
- *            output_height, output_width]
- */
-template <class T>
-class Im2ColFunctor<DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const T* imData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* colData) {
-    int channelsCol = inputChannels * filterHeight * filterWidth;
-
-    for (int c = 0; c < channelsCol; ++c) {
-      int wOffset = c % filterWidth;
-      int hOffset = (c / filterWidth) % filterHeight;
-      int c_im = c / filterWidth / filterHeight;
-      for (int h = 0; h < outputHeight; ++h) {
-        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
-          if ((imRowIdx - paddingHeight) < 0 ||
-              (imRowIdx - paddingHeight) >= inputHeight ||
-              (imColIdx - paddingWidth) < 0 ||
-              (imColIdx - paddingWidth) >= inputWidth) {
-            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
-          } else {
-            imRowIdx += c_im * inputHeight - paddingHeight;
-            imColIdx -= paddingWidth;
-            colData[(c * outputHeight + h) * outputWidth + w] =
-                imData[imRowIdx * inputWidth + imColIdx];
-          }
-        }
-      }
-    }
-  }
-};
-
-template <class T>
-class Col2ImFunctor<DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const T* colData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* imData) {
-    int channelsCol = inputChannels * filterHeight * filterWidth;
-
-    for (int c = 0; c < channelsCol; ++c) {
-      int wOffset = c % filterWidth;
-      int hOffset = (c / filterWidth) % filterHeight;
-      int c_im = c / filterWidth / filterHeight;
-      for (int h = 0; h < outputHeight; ++h) {
-        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
-          if ((imRowIdx - paddingHeight) >= 0 &&
-              (imRowIdx - paddingHeight) < inputHeight &&
-              (imColIdx - paddingWidth) >= 0 &&
-              (imColIdx - paddingWidth) < inputWidth) {
-            imRowIdx += c_im * inputHeight - paddingHeight;
-            imColIdx -= paddingWidth;
-            imData[imRowIdx * inputWidth + imColIdx] +=
-                colData[(c * outputHeight + h) * outputWidth + w];
-          }
-        }
-      }
-    }
-  }
-};
-
 /*
  * \brief Forward calculation of convolution.
  */
@@ -155,15 +67,20 @@ class GemmConvFunction : public ConvFunctionBase {
     real* inputData = inputs[0].data<real>();
     real* filterData = inputs[1].data<real>();
     real* outputData = outputs[0].data<real>();
-
-    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
-                  outputHeight * outputWidth;
-    resizeBuffer<Device>(size);
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+    TensorShape colShape = TensorShape({inputChannels / groups_,
+                                        filterHeight,
+                                        filterWidth,
+                                        outputHeight,
+                                        outputWidth});
+
+    resizeBuffer<Device>(colShape.getElements());
     real* colData = reinterpret_cast<real*>(memory_->getBuf());
 
-    Im2ColFunctor<Device, real> im2col;
+    Im2ColFunctor<kCFO, Device, real> im2col;
     GemmFunctor<Device, real> gemm;
-    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
     size_t filterOffset = filter.getElements() / groups_;
@@ -171,18 +88,13 @@ class GemmConvFunction : public ConvFunctionBase {
     for (size_t i = 0; i < batchSize; i++) {
       for (size_t g = 0; g < groups_; g++) {
         im2col(inputData + g * inputOffset,
-               inputChannels / groups_,
-               inputHeight,
-               inputWidth,
-               filterHeight,
-               filterWidth,
+               imShape,
+               colData,
+               colShape,
                strideH(),
                strideW(),
                paddingH(),
-               paddingW(),
-               outputHeight,
-               outputWidth,
-               colData);
+               paddingW());
 
         int M = outputChannels / groups_;
         int N = outputHeight * outputWidth;
@@ -249,15 +161,20 @@ class GemmConvGradInputFunction : public ConvFunctionBase {
     real* outputGrad = inputs[0].data<real>();
     real* filterData = inputs[1].data<real>();
     real* inputGrad = outputs[0].data<real>();
-
-    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
-                  outputHeight * outputWidth;
-    resizeBuffer<Device>(size);
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+    TensorShape colShape = TensorShape({inputChannels / groups_,
+                                        filterHeight,
+                                        filterWidth,
+                                        outputHeight,
+                                        outputWidth});
+
+    resizeBuffer<Device>(colShape.getElements());
     real* colData = reinterpret_cast<real*>(memory_->getBuf());
 
-    Col2ImFunctor<Device, real> col2im;
+    Col2ImFunctor<kCFO, Device, real> col2im;
     GemmFunctor<Device, real> gemm;
-    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
     size_t filterOffset = filter.getElements() / groups_;
@@ -280,20 +197,14 @@ class GemmConvGradInputFunction : public ConvFunctionBase {
              0.0f,
              colData,
              N);
-
-        col2im(colData,
-               inputChannels / groups_,
-               inputHeight,
-               inputWidth,
-               filterHeight,
-               filterWidth,
+        col2im(inputGrad + g * inputOffset,
+               imShape,
+               colData,
+               colShape,
                strideH(),
                strideW(),
                paddingH(),
-               paddingW(),
-               outputHeight,
-               outputWidth,
-               inputGrad + g * inputOffset);
+               paddingW());
       }
       inputGrad += inputChannels * inputHeight * inputWidth;
       outputGrad += outputChannels * outputHeight * outputWidth;
@@ -347,33 +258,33 @@ class GemmConvGradFilterFunction : public ConvFunctionBase {
     real* outputGrad = inputs[0].data<real>();
     real* inputData = inputs[1].data<real>();
     real* filterGrad = outputs[0].data<real>();
-
-    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
-                  outputHeight * outputWidth;
-    resizeBuffer<Device>(size);
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+    TensorShape colShape = TensorShape({inputChannels / groups_,
+                                        filterHeight,
+                                        filterWidth,
+                                        outputHeight,
+                                        outputWidth});
+
+    resizeBuffer<Device>(colShape.getElements());
     real* colData = reinterpret_cast<real*>(memory_->getBuf());
 
-    Im2ColFunctor<Device, real> im2col;
+    Im2ColFunctor<kCFO, Device, real> im2col;
     GemmFunctor<Device, real> gemm;
-    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
     size_t filterOffset = filter.getElements() / groups_;
     for (size_t i = 0; i < batchSize; i++) {
       for (size_t g = 0; g < groups_; g++) {
         im2col(inputData + g * inputOffset,
-               inputChannels / groups_,
-               inputHeight,
-               inputWidth,
-               filterHeight,
-               filterWidth,
+               imShape,
+               colData,
+               colShape,
                strideH(),
                strideW(),
                paddingH(),
-               paddingW(),
-               outputHeight,
-               outputWidth,
-               colData);
+               paddingW());
 
         int M = outputChannels / groups_;
         int K = outputHeight * outputWidth;

From 9e6ed83cc4295414436ab784db10bf715637cddf Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 21 Jun 2017 11:26:40 +0800
Subject: [PATCH 13/20] Fix ImageExpandFunction.

---
 paddle/function/ImageExpandOp.cpp | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index 625bf5b6edf44..ca1d117db8845 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -45,9 +45,7 @@ class ImageExpandFunction : public FunctionBase {
     numOutputs_ = 1;
   }
 
-  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
-
-  void check(const TensorShape& image, const TensorShape& sequence) const {
+  void checkShape(const TensorShape& image, const TensorShape& sequence) const {
     // image shape should be 4-dimensional.
     CHECK_EQ(image.ndims(), (size_t)4);
     // sequence shape should be 3-dimensional.
@@ -108,12 +106,18 @@ class ImageExpandForward : public ImageExpandFunction {
     ImageExpandFunction::init(config);
   }
 
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& image = inputs[0].shape();
+    const TensorShape& sequence = outputs[0].shape();
+    checkShape(image, sequence);
+  }
+
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(numInputs_, inputs.size());
     CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
     const TensorShape& image = inputs[0].shape();
     const TensorShape& sequence = outputs[0].shape();
-    check(image, sequence);
 
     TensorShape imShape = TensorShape({image[1], image[2], image[3]});
     TensorShape colShape = getColShape(image, sequence);
@@ -149,15 +153,21 @@ class ImageExpandBackward : public ImageExpandFunction {
     ImageExpandFunction::init(config);
   }
 
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& image = outputs[0].shape();
+    const TensorShape& sequence = inputs[0].shape();
+    checkShape(image, sequence);
+  }
+
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(numInputs_, inputs.size());
     CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
     // Since the implementation of Col2ImFunctor is ADD_TO,
     // this function only supports ADD_TO mode.
     CHECK_EQ(outputs[0].getArgType(), ADD_TO);
     const TensorShape& image = outputs[0].shape();
     const TensorShape& sequence = inputs[0].shape();
-    check(image, sequence);
 
     TensorShape imShape = TensorShape({image[1], image[2], image[3]});
     TensorShape colShape = getColShape(image, sequence);

From 5bfcb7f853834009facd51ce5e2a989240bc3fcc Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 21 Jun 2017 11:31:08 +0800
Subject: [PATCH 14/20] Remove useless code.

---
 paddle/function/GemmConvOp.h     |  62 -----------
 paddle/function/GemmConvOpGpu.cu | 186 -------------------------------
 2 files changed, 248 deletions(-)
 delete mode 100644 paddle/function/GemmConvOp.h
 delete mode 100644 paddle/function/GemmConvOpGpu.cu

diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h
deleted file mode 100644
index 9f11cce597a07..0000000000000
--- a/paddle/function/GemmConvOp.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ConvOp.h"
-
-namespace paddle {
-
-/*
- * imData = [input_channels, input_height, input_width]
- * colData = [input_channels, filter_height, filter_width,
- *            output_height, output_width]
- */
-template <DeviceType Device, class T>
-class Im2ColFunctor {
-public:
-  void operator()(const T* imData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* colData);
-};
-
-template <DeviceType Device, class T>
-class Col2ImFunctor {
-public:
-  void operator()(const T* colData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* imData);
-};
-
-}  // namespace paddle
diff --git a/paddle/function/GemmConvOpGpu.cu b/paddle/function/GemmConvOpGpu.cu
deleted file mode 100644
index 2a1795ff0fb56..0000000000000
--- a/paddle/function/GemmConvOpGpu.cu
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOp.h"
-#include "GemmConvOp.h"
-
-namespace paddle {
-
-template<class T>
-__global__
-void im2col(const T* data_im, int numOuts, int height, int width,
-            int blockH, int blockW,
-            int strideH, int strideW,
-            int paddingH, int paddingW,
-            int height_col, int width_col,
-            T* data_col) {
-  int index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < numOuts) {
-    int w_out = index % width_col;
-    index /= width_col;
-    int h_out = index % height_col;
-    int channel_in = index / height_col;
-    int channel_out = channel_in * blockH * blockW;
-    int h_in = h_out * strideH;
-    int w_in = w_out * strideW;
-
-    data_col += (channel_out * height_col + h_out) * width_col + w_out;
-    for (int i = 0; i < blockH; ++i) {
-      for (int j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in+i);
-        int cIdx = int(w_in+j);
-        if ((rIdx-(int)paddingH) >= (int)height ||
-            (rIdx-(int)paddingH) < 0 ||
-            (cIdx-(int)paddingW) >= (int)width ||
-            (cIdx-(int)paddingW) < 0) {
-          *data_col = 0;
-        } else {
-          rIdx = rIdx + channel_in*height - paddingH;
-          cIdx = cIdx - paddingW;
-          *data_col = data_im[rIdx* width + cIdx];
-        }
-        data_col += height_col * width_col;
-      }
-    }
-  }
-}
-
-template <class T>
-class Im2ColFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const T* imData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* colData) {
-    int numKernels = inputChannels * outputHeight * outputWidth;
-    int blocks = (numKernels + 1024 -1) / 1024;
-    int blockX = 512;
-    int blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-    im2col<T><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth,
-         strideHeight, strideWidth, paddingHeight, paddingWidth,
-         outputHeight, outputWidth, colData);
-    CHECK_SYNC("Im2ColFunctor GPU failed");
-  }
-};
-
-template<class T>
-__global__
-void col2im(size_t n, const T* data_col, size_t height,
-            size_t width, size_t channels,
-            size_t blockH, size_t blockW,
-            size_t strideH, size_t strideW,
-            size_t paddingH, size_t paddingW,
-            size_t height_col, size_t width_col,
-            T* data_im) {
-  size_t index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    T val = 0;
-    int w = int(index % width);
-    int h = int((index / width) % height);
-    int c = int(index / (width * height));
-    if ((w - (int)paddingW) >= 0 &&
-        (w - (int)paddingW) < (width-2 * paddingW) &&
-        (h - (int)paddingH) >= 0 &&
-        (h - paddingH) < (height - 2 * paddingH)) {
-      // compute the start and end of the output
-      int w_col_start =
-        (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
-      int w_col_end =
-        min((int)(w / (int)strideW + 1), (int)(width_col));
-      int h_col_start =
-        (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
-      int h_col_end = min(int(h / strideH + 1), int(height_col));
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * blockH* blockW) + \
-            (h - h_col * (int)strideH) * (int)blockW +
-            (w - w_col * (int)strideW);
-          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
-        }
-      }
-      h -= paddingH;
-      w -= paddingW;
-      data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
-              h*(width-2*paddingW) + w] += val;
-    }
-  }
-}
-
-template <class T>
-class Col2ImFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const T* colData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* imData) {
-    size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight)
-        * (inputWidth + 2*paddingWidth);
-
-    size_t blocks = (numKernels + 1024 -1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks+512-1)/512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    // To avoid involving atomic operations, we will launch one kernel per
-    // bottom dimension, and then in the kernel add up the top dimensions.
-    col2im<T><<< grid, threads, 0, STREAM_DEFAULT >>>
-             (numKernels,
-              colData,
-              inputHeight + 2*paddingHeight,
-              inputWidth + 2*paddingWidth,
-              inputChannels,
-              filterHeight,
-              filterWidth,
-              strideHeight,
-              strideWidth,
-              paddingHeight,
-              paddingWidth,
-              outputHeight,
-              outputWidth,
-              imData);
-    CHECK_SYNC("Col2ImFunctor GPU failed");
-  }
-};
-
-template class Im2ColFunctor<DEVICE_TYPE_GPU, float>;
-template class Im2ColFunctor<DEVICE_TYPE_GPU, double>;
-template class Col2ImFunctor<DEVICE_TYPE_GPU, float>;
-template class Col2ImFunctor<DEVICE_TYPE_GPU, double>;
-
-}  // namespace paddle

From 09d712d6aec0376b5ccea09e0d2c546ea1149aba Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 21 Jun 2017 11:38:25 +0800
Subject: [PATCH 15/20] Remove useless code(Matrix::convExpand and
 Matrix::convShrink).

---
 paddle/cuda/include/hl_cnn.h           |  67 ----------
 paddle/cuda/include/stub/hl_cnn_stub.h |  30 -----
 paddle/cuda/src/hl_cuda_cnn.cu         | 128 ------------------
 paddle/math/Matrix.cpp                 | 172 -------------------------
 paddle/math/Matrix.h                   |  99 --------------
 5 files changed, 496 deletions(-)

diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index f55197c8c9ebb..9f84db72da24b 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -17,73 +17,6 @@ limitations under the License. */
 
 #include "hl_base.h"
 
-/**
- * @brief   Shrink column to feature.
- *
- * @param[in]   dataCol     expand data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   blockH      filter height.
- * @param[in]   blockW      filter width.
- * @param[in]   strideH     stride height.
- * @param[in]   strideW     stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   outputH     output height.
- * @param[in]   outputW     output width.
- * @param[out]  dataIm      output image data.
- * @param[in]   alpha
- * @param[in]   beta
- */
-extern void hl_shrink_col2feature(const real* dataCol,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataIm,
-                                  real alpha = 1.0f,
-                                  real beta = 0.0f);
-
-/**
- * @brief   Expand feature to column.
- *
- * @param[in]   dataIm      input image data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   blockH      filter height.
- * @param[in]   blockW      filter width.
- * @param[in]   strideH     stride height.
- * @param[in]   strideW     stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   outputH     output height.
- * @param[in]   outputW     output width.
- * @param[out]  dataCol     expand data.
- *
- */
-extern void hl_expand_feature2col(const real* dataIm,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataCol);
-
 /**
  * @brief   Maximum pool forward.
  *
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 039551c6cc695..2bbb9fa8dfd5e 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -17,36 +17,6 @@ limitations under the License. */
 
 #include "hl_cnn.h"
 
-inline void hl_shrink_col2feature(const real* dataCol,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataIm,
-                                  real alpha,
-                                  real beta) {}
-
-inline void hl_expand_feature2col(const real* dataIm,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataCol) {}
-
 inline void hl_maxpool_forward(const int frameCnt,
                                const real* inputData,
                                const int channels,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index b94f4d8fe4a25..b6e3e63a4f522 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -18,134 +18,6 @@ limitations under the License. */
 #include "hl_cnn.h"
 #include "hl_device_functions.cuh"
 
-__global__ void KeFeature2col(size_t n, size_t height, const real* data_im,
-                              size_t blockH, size_t blockW, size_t width,
-                              size_t strideH, size_t strideW,
-                              size_t paddingH, size_t paddingW,
-                              size_t height_col, size_t width_col,
-                              real* data_col) {
-  size_t index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    size_t w_out = index % width_col;
-    index /= width_col;
-    size_t h_out = index % height_col;
-    size_t channel_in = index / height_col;
-    size_t channel_out = channel_in * blockH * blockW;
-    size_t h_in = h_out * strideH;
-    size_t w_in = w_out * strideW;
-
-    data_col += (channel_out * height_col + h_out) * width_col + w_out;
-    for (size_t i = 0; i < blockH; ++i) {
-      for (size_t j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in+i);
-        int cIdx = int(w_in+j);
-        if ((rIdx-(int)paddingH) >= (int)height ||
-            (rIdx-(int)paddingH) < 0 ||
-            (cIdx-(int)paddingW) >= (int)width ||
-            (cIdx-(int)paddingW) < 0) {
-          *data_col = 0;
-        } else {
-          rIdx = rIdx + channel_in*height - paddingH;
-          cIdx = cIdx - paddingW;
-          *data_col = data_im[rIdx* width + cIdx];
-        }
-        data_col += height_col * width_col;
-      }
-    }
-  }
-}
-
-void hl_expand_feature2col(const real* dataIm, size_t channels,
-                           size_t height, size_t width,
-                           size_t blockH, size_t blockW,
-                           size_t strideH, size_t strideW,
-                           size_t paddingH, size_t paddingW,
-                           size_t outputH, size_t outputW,
-                           real* dataCol) {
-  size_t numKernels = channels * outputH * outputW;
-
-  size_t blocks = (numKernels + 1024 -1) / 1024;
-  size_t blockX = 512;
-  size_t blockY = (blocks+512-1)/512;
-  dim3 threads(1024, 1);
-  dim3 grid(blockX, blockY);
-  KeFeature2col<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (numKernels, height, dataIm, blockH, blockW, width,
-           strideH, strideW, paddingH, paddingW,
-           outputH, outputW, dataCol);
-  CHECK_SYNC("hl_expand_feature2col failed");
-}
-
-__global__ void KeCol2Feature(size_t n, const real* data_col, size_t height,
-                              size_t width, size_t channels,
-                              size_t blockH, size_t blockW,
-                              size_t strideH, size_t strideW,
-                              size_t paddingH, size_t paddingW,
-                              size_t height_col, size_t width_col,
-                              real* data_im, real alpha, real beta) {
-  size_t index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    real val = 0;
-    int w = int(index % width);
-    int h = int((index / width) % height);
-    int c = int(index / (width * height));
-    if ((w - (int)paddingW) >= 0 &&
-        (w - (int)paddingW) < (width-2 * paddingW) &&
-        (h - (int)paddingH) >= 0 &&
-        (h - paddingH) < (height - 2 * paddingH)) {
-      // compute the start and end of the output
-      int w_col_start =
-        (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
-      int w_col_end =
-        min((int)(w / (int)strideW + 1), (int)(width_col));
-      int h_col_start =
-        (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
-      int h_col_end = min(int(h / strideH + 1), int(height_col));
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * blockH* blockW) + \
-            (h - h_col * (int)strideH) * (int)blockW +
-            (w - w_col * (int)strideW);
-          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
-        }
-      }
-      h -= paddingH;
-      w -= paddingW;
-      real tD = data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
-                          h*(width-2*paddingW) + w];
-      data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
-              h*(width-2*paddingW) + w] = alpha * val + beta*tD;
-    }
-  }
-}
-
-void hl_shrink_col2feature(const real * dataCol, size_t channels,
-                           size_t height, size_t width,
-                           size_t blockH, size_t blockW,
-                           size_t strideH, size_t strideW,
-                           size_t paddingH, size_t paddingW,
-                           size_t outputH, size_t outputW,
-                           real* dataIm, real alpha, real beta) {
-  size_t numKernels = channels * (height + 2*paddingH) * (width + 2*paddingW);
-
-  size_t blocks = (numKernels + 1024 -1) / 1024;
-  size_t blockX = 512;
-  size_t blockY = (blocks+512-1)/512;
-  dim3 threads(1024, 1);
-  dim3 grid(blockX, blockY);
-
-  // To avoid involving atomic operations, we will launch one kernel per
-  // bottom dimension, and then in the kernel add up the top dimensions.
-  KeCol2Feature<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (numKernels, dataCol, height + 2*paddingH, width + 2*paddingW,
-           channels, blockH, blockW, strideH, strideW, paddingH, paddingW,
-           outputH, outputW, dataIm, alpha, beta);
-  CHECK_SYNC("hl_shrink_col2feature failed");
-}
-
 __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
                                  const int channels, const int height,
                                  const int width,
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index c910146164ebf..a3ad9d46e4988 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1016,81 +1016,6 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
   LOG(INFO) << "the  diffCnt is " << diffCnt;
 }
 
-void GpuMatrix::convExpand(Matrix& feature,
-                           int feaImgHeight,
-                           int feaImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW) {
-  CHECK(feature.useGpu_ == true) << "Matrix type are not equal";
-
-  CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels),
-           feature.getHeight() * feature.getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-  CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal";
-
-  hl_expand_feature2col(feature.getData(),
-                        channels,
-                        feaImgHeight,
-                        feaImgWidth,
-                        blockH,
-                        blockW,
-                        strideH,
-                        strideW,
-                        paddingH,
-                        paddingW,
-                        outputH,
-                        outputW,
-                        getData());
-}
-
-void GpuMatrix::convShrink(Matrix& expandFeat,
-                           int thisImgHeight,
-                           int thisImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW,
-                           real alpha,
-                           real beta) {
-  CHECK(expandFeat.useGpu_ == true) << "Matrix type are not equal";
-  CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels),
-           getHeight() * getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockW * blockH * channels;
-  CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth())
-      << "Matrix dimensions are not equal";
-  hl_shrink_col2feature(expandFeat.getData(),
-                        channels,
-                        thisImgHeight,
-                        thisImgWidth,
-                        blockH,
-                        blockW,
-                        strideH,
-                        strideW,
-                        paddingH,
-                        paddingW,
-                        outputH,
-                        outputW,
-                        getData(),
-                        alpha,
-                        beta);
-}
-
 void GpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t imgSizeH,
                                size_t imgSizeW,
@@ -1775,103 +1700,6 @@ void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
   CHECK_EQ(info, 0);
 }
 
-void CpuMatrix::convExpand(Matrix& feature,
-                           int feaImgHeight,
-                           int feaImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW) {
-  CHECK(feature.useGpu_ == false) << "Matrix type are not equal";
-
-  CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels),
-           feature.getHeight() * feature.getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-  CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal";
-
-  int channelsCol = channels * blockH * blockW;
-  real* srcData = feature.getData();
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % blockW;
-    int hOffset = (c / blockW) % blockH;
-    int c_im = c / blockH / blockW;
-    for (int h = 0; h < outputH; ++h) {
-      for (int w = 0; w < outputW; ++w) {
-        // no c_im*height to Exclude the channel number
-        int imgRowIdx = h * strideH + hOffset;
-        int imgColIdx = w * strideW + wOffset;
-        if ((imgRowIdx - paddingH) < 0 ||
-            (imgRowIdx - paddingH) >= feaImgHeight ||
-            (imgColIdx - paddingW) < 0 ||
-            (imgColIdx - paddingW) >= feaImgWidth) {
-          data_[(c * outputH + h) * outputW + w] = 0;
-        } else {
-          imgRowIdx += c_im * feaImgHeight - paddingH;
-          imgColIdx -= paddingW;
-          data_[(c * outputH + h) * outputW + w] =
-              srcData[imgRowIdx * feaImgWidth + imgColIdx];
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::convShrink(Matrix& expandFeat,
-                           int thisImgHeight,
-                           int thisImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW,
-                           real alpha,
-                           real beta) {
-  CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal";
-  CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels),
-           getHeight() * getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-
-  CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth())
-      << "Matrix dimensions are not equal";
-
-  real* expandData = expandFeat.getData();
-  int channelsCol = channels * blockH * blockW;
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % blockW;
-    int hOffset = (c / blockW) % blockH;
-    int c_im = c / blockW / blockH;
-    for (int h = 0; h < outputH; ++h) {
-      for (int w = 0; w < outputW; ++w) {
-        int imRowIdx = h * strideH + hOffset;
-        int imColIdx = w * strideW + wOffset;
-        if ((imRowIdx - paddingH) >= 0 &&
-            (imRowIdx - paddingH) < thisImgHeight &&
-            (imColIdx - paddingW) >= 0 &&
-            (imColIdx - paddingW) < thisImgWidth) {
-          imRowIdx += c_im * thisImgHeight - paddingH;
-          imColIdx -= paddingW;
-          data_[imRowIdx * thisImgWidth + imColIdx] =
-              alpha * expandData[(c * outputH + h) * outputW + w] +
-              beta * data_[imRowIdx * thisImgWidth + imColIdx];
-        }
-      }
-    }
-  }
-}
-
 void CpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t imgSizeH,
                                size_t imgSizeW,
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 748be850b4c90..bbf98a609c63e 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -858,49 +858,6 @@ class Matrix : public BaseMatrix {
     LOG(FATAL) << "Not implemented";
   }
 
-  /**
-   * This function is used to calculate the convolution:
-   *
-   * It will expand a feature matrix according to the
-   * convolution filters
-   */
-  virtual void convExpand(Matrix& feature,
-                          int feaImgHeight,
-                          int feaImgWidth,
-                          int channels,
-                          int blockH,
-                          int blockW,
-                          int strideH,
-                          int strideW,
-                          int paddingH,
-                          int paddingW,
-                          int outputH,
-                          int outputW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * This function is the reverse implementation of convExpand:
-   *
-   * Its function is to restore a expanded-matrix into a feature matrix
-   */
-  virtual void convShrink(Matrix& expandColMat,
-                          int thisImgHeight,
-                          int thisImgWidth,
-                          int channels,
-                          int blockH,
-                          int blockW,
-                          int strideH,
-                          int strideW,
-                          int paddingH,
-                          int paddingW,
-                          int outputH,
-                          int outputW,
-                          real alpha = 1.0f,
-                          real beta = 0.0f) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
   /**
    * Pooling forward operation, pick out the largest element
    * in the sizeX of value
@@ -1334,34 +1291,6 @@ class GpuMatrix : public Matrix {
 
   void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
 
-  void convExpand(Matrix& feature,
-                  int feaImgHeight,
-                  int feaImgWidth,
-                  int channels,
-                  int blockH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW);
-
-  void convShrink(Matrix& expandColMat,
-                  int thisImgHeight,
-                  int thisImgWidth,
-                  int channels,
-                  int blockH,
-                  int blochW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingWreal,
-                  int outputH,
-                  int outputW,
-                  real alpha = 1.0f,
-                  real beta = 0.0f);
-
   void maxPoolForward(Matrix& inputMat,
                       size_t imgSizeH,
                       size_t imgSizeW,
@@ -1521,34 +1450,6 @@ class CpuMatrix : public Matrix {
 
   MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
 
-  void convExpand(Matrix& feature,
-                  int feaImgHeight,
-                  int feaImgWidth,
-                  int channels,
-                  int blcokH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW);
-
-  void convShrink(Matrix& expandFeat,
-                  int thisImgHeight,
-                  int thisImgWidth,
-                  int channels,
-                  int blockH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW,
-                  real alpha = 1.0f,
-                  real beta = 0.0f);
-
   void maxPoolForward(Matrix& inputMat,
                       size_t imgSizeH,
                       size_t imgSizeW,

From 86a679b0c485cac9df354e2c37abaacc8ea9771d Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 21 Jun 2017 17:07:55 +0800
Subject: [PATCH 16/20] Add unit test of ImageExpandOp.

---
 paddle/function/CMakeLists.txt        |   1 +
 paddle/function/ImageExpandOp.cpp     |   1 +
 paddle/function/ImageExpandOpTest.cpp | 107 ++++++++++++++++++++++++++
 3 files changed, 109 insertions(+)
 create mode 100644 paddle/function/ImageExpandOpTest.cpp

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 5e170714cf5b1..19f64eefd184e 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -29,6 +29,7 @@ if(WITH_GPU)
     add_simple_unittest(MulOpTest)
     add_simple_unittest(CosSimOpTest)
     add_simple_unittest(RowConvOpTest)
+    add_simple_unittest(ImageExpandOpTest)
 endif()
 
 add_simple_unittest(ConvOpTest)
diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index ca1d117db8845..00a2571936b39 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -116,6 +116,7 @@ class ImageExpandForward : public ImageExpandFunction {
     CHECK_EQ(numInputs_, inputs.size());
     CHECK_EQ(numOutputs_, outputs.size());
     check(inputs, outputs);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
     const TensorShape& image = inputs[0].shape();
     const TensorShape& sequence = outputs[0].shape();
 
diff --git a/paddle/function/ImageExpandOpTest.cpp b/paddle/function/ImageExpandOpTest.cpp
new file mode 100644
index 0000000000000..fb312549dc758
--- /dev/null
+++ b/paddle/function/ImageExpandOpTest.cpp
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(ImageExpandForward, real) {
+  for (size_t batchSize : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t inputHeight : {5, 33, 100}) {
+        for (size_t inputWidth : {5, 32, 96}) {
+          for (size_t block : {1, 3, 5}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                // init Test object
+                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> blocks = {block, block};
+                CpuGpuFuncCompare test("ImageExpand",
+                                       FuncConfig()
+                                           .set("strides", strides)
+                                           .set("paddings", paddings)
+                                           .set("blocks", blocks));
+
+                size_t outputHeight =
+                    1 +
+                    (inputHeight + 2 * padding - block + stride - 1) / stride;
+                size_t outputWidth =
+                    1 +
+                    (inputWidth + 2 * padding - block + stride - 1) / stride;
+                TensorShape inputShape =
+                    TensorShape({batchSize, channels, inputHeight, inputWidth});
+                TensorShape outputShape =
+                    TensorShape({batchSize,
+                                 outputHeight * outputWidth,
+                                 channels * block * block});
+                test.addInputs(BufferArg(VALUE_TYPE_FLOAT, inputShape));
+                test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outputShape));
+                // run Function
+                test.run();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(ImageExpandBackward, real) {
+  for (size_t batchSize : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t inputHeight : {5, 33, 100}) {
+        for (size_t inputWidth : {5, 32, 96}) {
+          for (size_t block : {1, 3, 5}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                // init Test object
+                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> blocks = {block, block};
+                CpuGpuFuncCompare test("ImageExpandGrad",
+                                       FuncConfig()
+                                           .set("strides", strides)
+                                           .set("paddings", paddings)
+                                           .set("blocks", blocks));
+
+                size_t outputHeight =
+                    1 +
+                    (inputHeight + 2 * padding - block + stride - 1) / stride;
+                size_t outputWidth =
+                    1 +
+                    (inputWidth + 2 * padding - block + stride - 1) / stride;
+                TensorShape inputShape =
+                    TensorShape({batchSize, channels, inputHeight, inputWidth});
+                TensorShape outputShape =
+                    TensorShape({batchSize,
+                                 outputHeight * outputWidth,
+                                 channels * block * block});
+                test.addInputs(BufferArg(VALUE_TYPE_FLOAT, outputShape));
+                test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inputShape),
+                                ADD_TO);
+                // run Function
+                test.run();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle

From c7610106032f63a0dea4d87bca88a61fc21fe8e3 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 27 Jun 2017 13:32:06 +0800
Subject: [PATCH 17/20] Add unit test for im2col.

---
 paddle/function/CMakeLists.txt |   1 +
 paddle/function/Im2ColTest.cpp | 110 +++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+)
 create mode 100644 paddle/function/Im2ColTest.cpp

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 19f64eefd184e..178d1153f43ad 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -33,6 +33,7 @@ if(WITH_GPU)
 endif()
 
 add_simple_unittest(ConvOpTest)
+add_simple_unittest(Im2ColTest)
 endif()
 
 add_style_check_target(paddle_function ${h_files})
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
new file mode 100644
index 0000000000000..d7dbf087c5f9c
--- /dev/null
+++ b/paddle/function/Im2ColTest.cpp
@@ -0,0 +1,110 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include <gtest/gtest.h>
+#include "Function.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/tests/TensorCheck.h"
+
+namespace paddle {
+
+TEST(Im2ColFunctor, real) {
+  for (size_t channels : {1, 5, 32}) {
+    for (size_t inputHeight : {5, 33, 100}) {
+      for (size_t inputWidth : {5, 32, 96}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                if (inputHeight <= filterHeight || inputWidth <= filterWidth)
+                  break;
+                if (padding >= filterHeight || padding >= filterWidth) break;
+                size_t outputHeight =
+                    (inputHeight - filterHeight + 2 * padding + stride) /
+                    stride;
+                size_t outputWidth =
+                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
+
+                TensorShape imShape =
+                    TensorShape({channels, inputHeight, inputWidth});
+                TensorShape colShape1 = TensorShape({channels,
+                                                     filterHeight,
+                                                     filterWidth,
+                                                     outputHeight,
+                                                     outputWidth});
+                TensorShape colShape2 = TensorShape({outputHeight,
+                                                     outputWidth,
+                                                     channels,
+                                                     filterHeight,
+                                                     filterWidth});
+
+                VectorPtr input = Vector::create(imShape.getElements(), false);
+                size_t height = channels * filterHeight * filterWidth;
+                size_t width = outputHeight * outputWidth;
+                MatrixPtr output1 = Matrix::create(height, width, false, false);
+                MatrixPtr output2 = Matrix::create(width, height, false, false);
+                Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, real> im2col1;
+                Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, real> im2col2;
+
+                input->uniform(0.001, 1);
+                im2col1(input->getData(),
+                        imShape,
+                        output1->getData(),
+                        colShape1,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+                im2col2(input->getData(),
+                        imShape,
+                        output2->getData(),
+                        colShape2,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+
+                MatrixPtr test;
+                output2->transpose(test, true);
+                autotest::TensorCheckErr(*output1, *test);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+#if 0
+TEST(Col2ImFunctor, real) {
+  for (size_t channels : {1, 5, 32}) {
+    for (size_t inputHeight : {5, 33, 100}) {
+      for (size_t inputWidth : {5, 32, 96}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+#endif
+
+}  // namespace paddle

From a83d52151cbe6ed82b0b35eb21219442a8ac926a Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 27 Jun 2017 17:34:24 +0800
Subject: [PATCH 18/20] Add unit test for Col2ImFunctor.

---
 paddle/function/Im2ColTest.cpp | 63 +++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 24 deletions(-)

diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index d7dbf087c5f9c..acc88a553abe7 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -20,7 +20,8 @@ limitations under the License. */
 
 namespace paddle {
 
-TEST(Im2ColFunctor, real) {
+template <DeviceType Device, class T>
+void TestIm2ColFunctor() {
   for (size_t channels : {1, 5, 32}) {
     for (size_t inputHeight : {5, 33, 100}) {
       for (size_t inputWidth : {5, 32, 96}) {
@@ -50,16 +51,18 @@ TEST(Im2ColFunctor, real) {
                                                      filterHeight,
                                                      filterWidth});
 
-                VectorPtr input = Vector::create(imShape.getElements(), false);
                 size_t height = channels * filterHeight * filterWidth;
                 size_t width = outputHeight * outputWidth;
+                VectorPtr input1 = Vector::create(imShape.getElements(), false);
+                VectorPtr input2 = Vector::create(imShape.getElements(), false);
                 MatrixPtr output1 = Matrix::create(height, width, false, false);
                 MatrixPtr output2 = Matrix::create(width, height, false, false);
-                Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, real> im2col1;
-                Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, real> im2col2;
+                input1->uniform(0.001, 1);
+                input2->copyFrom(*input1);
 
-                input->uniform(0.001, 1);
-                im2col1(input->getData(),
+                Im2ColFunctor<kCFO, Device, T> im2Col1;
+                Im2ColFunctor<kOCF, Device, T> im2Col2;
+                im2Col1(input1->getData(),
                         imShape,
                         output1->getData(),
                         colShape1,
@@ -67,7 +70,7 @@ TEST(Im2ColFunctor, real) {
                         stride,
                         padding,
                         padding);
-                im2col2(input->getData(),
+                im2Col2(input2->getData(),
                         imShape,
                         output2->getData(),
                         colShape2,
@@ -76,27 +79,32 @@ TEST(Im2ColFunctor, real) {
                         padding,
                         padding);
 
+                // The transposition of the result of ColFormat == kCFO
+                // is equal to the result of ColFormat == kOCF.
                 MatrixPtr test;
                 output2->transpose(test, true);
                 autotest::TensorCheckErr(*output1, *test);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
 
-#if 0
-TEST(Col2ImFunctor, real) {
-  for (size_t channels : {1, 5, 32}) {
-    for (size_t inputHeight : {5, 33, 100}) {
-      for (size_t inputWidth : {5, 32, 96}) {
-        for (size_t filterHeight : {1, 5}) {
-          for (size_t filterWidth : {3, 7}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
+                Col2ImFunctor<kCFO, Device, T> col2Im1;
+                Col2ImFunctor<kOCF, Device, T> col2Im2;
+                col2Im1(input1->getData(),
+                        imShape,
+                        output1->getData(),
+                        colShape1,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+                col2Im2(input2->getData(),
+                        imShape,
+                        output2->getData(),
+                        colShape2,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+
+                autotest::TensorCheckErr(*input1, *input2);
               }
             }
           }
@@ -105,6 +113,13 @@ TEST(Col2ImFunctor, real) {
     }
   }
 }
+
+TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
+
+#ifndef PADDLE_ONLY_CPU
+
+TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
+
 #endif
 
 }  // namespace paddle

From a7ff11404d097f759aaa2142458750631a9b7641 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 27 Jun 2017 17:53:31 +0800
Subject: [PATCH 19/20] Rename ImageExpandFunction to BlockExpandFunction
 (consistent with the name of the layer).

---
 paddle/function/ImageExpandOp.cpp          | 18 +++++++++---------
 paddle/function/ImageExpandOpTest.cpp      |  8 ++++----
 paddle/gserver/layers/BlockExpandLayer.cpp |  4 ++--
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp
index 00a2571936b39..a89b6bba45843 100644
--- a/paddle/function/ImageExpandOp.cpp
+++ b/paddle/function/ImageExpandOp.cpp
@@ -32,7 +32,7 @@ namespace paddle {
  * \param inputs[0]  Sequence data of NST format.
  * \param outputs[0] Image data of NCHW format.
  */
-class ImageExpandFunction : public FunctionBase {
+class BlockExpandFunction : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
     // function arguments
@@ -100,10 +100,10 @@ class ImageExpandFunction : public FunctionBase {
 };
 
 template <DeviceType Device>
-class ImageExpandForward : public ImageExpandFunction {
+class BlockExpandForward : public BlockExpandFunction {
 public:
   void init(const FuncConfig& config) override {
-    ImageExpandFunction::init(config);
+    BlockExpandFunction::init(config);
   }
 
   void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -148,10 +148,10 @@ class ImageExpandForward : public ImageExpandFunction {
 };
 
 template <DeviceType Device>
-class ImageExpandBackward : public ImageExpandFunction {
+class BlockExpandBackward : public BlockExpandFunction {
 public:
   void init(const FuncConfig& config) override {
-    ImageExpandFunction::init(config);
+    BlockExpandFunction::init(config);
   }
 
   void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -192,11 +192,11 @@ class ImageExpandBackward : public ImageExpandFunction {
   }
 };
 
-REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward);
-REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward);
+REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
+REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
 #ifndef PADDLE_ONLY_CPU
-REGISTER_TYPED_FUNC(ImageExpand, GPU, ImageExpandForward);
-REGISTER_TYPED_FUNC(ImageExpandGrad, GPU, ImageExpandBackward);
+REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
+REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
 #endif
 
 }  // namespace paddle
diff --git a/paddle/function/ImageExpandOpTest.cpp b/paddle/function/ImageExpandOpTest.cpp
index fb312549dc758..5e4897e72ba9f 100644
--- a/paddle/function/ImageExpandOpTest.cpp
+++ b/paddle/function/ImageExpandOpTest.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 namespace paddle {
 
-TEST(ImageExpandForward, real) {
+TEST(BlockExpandForward, real) {
   for (size_t batchSize : {5, 32}) {
     for (size_t channels : {1, 5, 32}) {
       for (size_t inputHeight : {5, 33, 100}) {
@@ -29,7 +29,7 @@ TEST(ImageExpandForward, real) {
                 std::vector<size_t> strides = {stride, stride};
                 std::vector<size_t> paddings = {padding, padding};
                 std::vector<size_t> blocks = {block, block};
-                CpuGpuFuncCompare test("ImageExpand",
+                CpuGpuFuncCompare test("BlockExpand",
                                        FuncConfig()
                                            .set("strides", strides)
                                            .set("paddings", paddings)
@@ -60,7 +60,7 @@ TEST(ImageExpandForward, real) {
   }
 }
 
-TEST(ImageExpandBackward, real) {
+TEST(BlockExpandBackward, real) {
   for (size_t batchSize : {5, 32}) {
     for (size_t channels : {1, 5, 32}) {
       for (size_t inputHeight : {5, 33, 100}) {
@@ -72,7 +72,7 @@ TEST(ImageExpandBackward, real) {
                 std::vector<size_t> strides = {stride, stride};
                 std::vector<size_t> paddings = {padding, padding};
                 std::vector<size_t> blocks = {block, block};
-                CpuGpuFuncCompare test("ImageExpandGrad",
+                CpuGpuFuncCompare test("BlockExpandGrad",
                                        FuncConfig()
                                            .set("strides", strides)
                                            .set("paddings", paddings)
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index adc9a814ffbb1..3b1f346359172 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -41,13 +41,13 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
   std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
   std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
   createFunction(forward_,
-                 "ImageExpand",
+                 "BlockExpand",
                  FuncConfig()
                      .set("strides", strides)
                      .set("paddings", paddings)
                      .set("blocks", blocks));
   createFunction(backward_,
-                 "ImageExpandGrad",
+                 "BlockExpandGrad",
                  FuncConfig()
                      .set("strides", strides)
                      .set("paddings", paddings)

From 7a550f90d8a7a1aea81f300d127f3aef975f8693 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 27 Jun 2017 18:05:14 +0800
Subject: [PATCH 20/20] Fix the function file name.

---
 paddle/function/{ImageExpandOp.cpp => BlockExpandOp.cpp}        | 0
 .../function/{ImageExpandOpTest.cpp => BlockExpandOpTest.cpp}   | 0
 paddle/function/CMakeLists.txt                                  | 2 +-
 3 files changed, 1 insertion(+), 1 deletion(-)
 rename paddle/function/{ImageExpandOp.cpp => BlockExpandOp.cpp} (100%)
 rename paddle/function/{ImageExpandOpTest.cpp => BlockExpandOpTest.cpp} (100%)

diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/BlockExpandOp.cpp
similarity index 100%
rename from paddle/function/ImageExpandOp.cpp
rename to paddle/function/BlockExpandOp.cpp
diff --git a/paddle/function/ImageExpandOpTest.cpp b/paddle/function/BlockExpandOpTest.cpp
similarity index 100%
rename from paddle/function/ImageExpandOpTest.cpp
rename to paddle/function/BlockExpandOpTest.cpp
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 178d1153f43ad..bef4d2955b3c9 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -29,7 +29,7 @@ if(WITH_GPU)
     add_simple_unittest(MulOpTest)
     add_simple_unittest(CosSimOpTest)
     add_simple_unittest(RowConvOpTest)
-    add_simple_unittest(ImageExpandOpTest)
+    add_simple_unittest(BlockExpandOpTest)
 endif()
 
 add_simple_unittest(ConvOpTest)