From 48e0f432537a97e915306601a8b5c8b72a77b6d1 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Mon, 12 Jun 2017 21:22:15 +0800 Subject: [PATCH 01/20] Add ImageExpandFunction. --- paddle/function/GemmConvOp.h | 84 +++++++++++++++ paddle/function/ImageExpandOp.cpp | 164 ++++++++++++++++++++++++++++++ 2 files changed, 248 insertions(+) create mode 100644 paddle/function/GemmConvOp.h create mode 100644 paddle/function/ImageExpandOp.cpp diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h new file mode 100644 index 0000000000000..25d2e220bfb93 --- /dev/null +++ b/paddle/function/GemmConvOp.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// #include "ConvOp.h" + +namespace paddle { + +/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ +enum ColFormat { kCFO = 0, kOCF = 1 }; + +/* + * \brief Converts the image data of four dimensions(NCHW) into a colData. + * Then you can reshape colData to a convolution matrix for + * convolution calculation based on matrix multiplication. + * + * \param imData Image data of NCHW format. + * The format of imData is: + * [input_channels, input_height, input_width]. + * \param colData colData data. + * If the template argument Format is kCFO, + * the format of colData is: + * [input_channels, + * filter_height, + * filter_width, + * output_height, + * output_width] + * If the template argument Format is kOCF, + * the format of colData is: + * [output_height, + * output_width, + * input_channels, + * filter_height, + * filter_width] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth, + T* colData); +}; + +template +class Col2ImFunctor { +public: + void operator()(const T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth, + T* imData); +}; + +} // namespace paddle diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp new file mode 100644 index 0000000000000..426b6c8e312e4 --- /dev/null +++ b/paddle/function/ImageExpandOp.cpp @@ -0,0 +1,164 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" +#include "GemmConvOp.h" + +namespace paddle { + +/* + * imData = [input_channels, input_height, input_width] + * colData = [output_height, output_width, + * input_channels, filter_height, filter_width] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth, + T* colData) { + for (int outputH = 0; outputH < outputHeight; ++outputH) { + for (int outputW = 0; outputW < outputWidth; ++outputW) { + for (int channel = 0; channel < inputChannels; ++channel) { + for (int filterH = 0; filterH < filterHeight; ++filterH) { + for (int filterW = 0; filterW < filterWidth; ++filterW) { + int imRowOffset = + outputH * strideHeight + filterH - paddingHeight; + int imColOffset = outputW * strideWidth + filterW - paddingWidth; + int colDataOffset = + (((outputH * outputWidth + outputW) * inputChannels + + channel) * + filterHeight + + filterH) * + filterWidth + + filterW; + if (imRowOffset < 0 || imRowOffset >= inputHeight || + imColOffset < 0 || imColOffset >= inputWidth) { + colData[colDataOffset] = T(0); + } else { + int imDataOffset = + (channel * inputHeight + imRowOffset) * inputWidth + + imColOffset; + colData[colDataOffset] = imData[imDataOffset]; + } + } + } + } + } + } + } +}; + +/* + * \brief Converts the image data of four dimensions(NCHW) into + * a sequence data of three dimensions(NST). Where N is batch size, + * S is the length of the sequence after each image is expanded, + * T is the size of each time step in the sequence. + * + * \param inputs[0] Image data of NCHW format. + * \param outputs[0] Sequence data of NST format. + */ +template +class ImageExpandFunction : public FunctionBase { +public: + void init(const FuncConfig& config) override { + // function arguments + strides_ = config.get>("strides"); + paddings_ = config.get>("paddings"); + blocks_ = config.get>("blocks"); + + // number of inputs and outputs + numInputs_ = 1; + numOutputs_ = 1; + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + const TensorShape& input = inputs[0].shape(); + const TensorShape& output = outputs[0].shape(); + // input argument should be 4-dimensional. + CHECK_EQ(input.ndims(), (size_t)4); + // output argument should be 3-dimensional. + CHECK_EQ(output.ndims(), (size_t)3); + // The batchSize of the input needs to be equal to + // the batchSize of the output. + CHECK_EQ(input[0], output[0]); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t seqLength = output[1]; + size_t stepSize = output[2]; + size_t outputHeight = + 1 + + (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH(); + size_t outputWidth = + 1 + + (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW(); + CHECK_EQ(seqLength, outputHeight * outputWidth); + CHECK_EQ(stepSize, inputChannels * blockH() * blockH()); + + real* inputData = inputs[0].data(); + real* outputData = outputs[0].data(); + Im2ColFunctor im2col; + for (size_t i = 0; i < batchSize; i++) { + im2col(inputData, + inputChannels, + inputHeight, + inputWidth, + blockH(), + blockW(), + strideH(), + strideW(), + paddingH(), + paddingW(), + outputHeight, + outputWidth, + outputData); + inputData += inputChannels * inputHeight * inputWidth; + outputData += seqLength * stepSize; + } + } + +protected: + std::vector strides_; + std::vector paddings_; + std::vector blocks_; + + inline int strideH() const { return strides_[0]; } + + inline int strideW() const { return strides_[1]; } + + inline int paddingH() const { return paddings_[0]; } + + inline int paddingW() const { return paddings_[1]; } + + inline int blockH() const { return blocks_[0]; } + + inline int blockW() const { return blocks_[1]; } +}; + +} // namespace paddle From 61aa1098fd13339c5be752cd1dc8f0119296c966 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 10:51:52 +0800 Subject: [PATCH 02/20] BlockExpandLayer based on the ImageExpand Function. --- paddle/function/ImageExpandOp.cpp | 9 ++- paddle/gserver/layers/BlockExpandLayer.cpp | 80 ++++++++++++++-------- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp index 426b6c8e312e4..0c10f30bbd9c1 100644 --- a/paddle/function/ImageExpandOp.cpp +++ b/paddle/function/ImageExpandOp.cpp @@ -119,12 +119,17 @@ class ImageExpandFunction : public FunctionBase { 1 + (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW(); CHECK_EQ(seqLength, outputHeight * outputWidth); - CHECK_EQ(stepSize, inputChannels * blockH() * blockH()); + CHECK_EQ(stepSize, inputChannels * blockH() * blockW()); real* inputData = inputs[0].data(); real* outputData = outputs[0].data(); Im2ColFunctor im2col; for (size_t i = 0; i < batchSize; i++) { + // The result of im2col is [output_height, output_width, + // input_channels, filter_height, filter_width], and it is easy to + // reshape into [seqLength, stepSize], where seqLength is equal + // output_height * output_width, stepSize is equal + // input_channels * filter_height * filter_width im2col(inputData, inputChannels, inputHeight, @@ -161,4 +166,6 @@ class ImageExpandFunction : public FunctionBase { inline int blockW() const { return blocks_[1]; } }; +REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandFunction); + } // namespace paddle diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp index 2bafeb92158c5..9760d39bb4a3c 100644 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ b/paddle/gserver/layers/BlockExpandLayer.cpp @@ -37,6 +37,18 @@ bool BlockExpandLayer::init(const LayerMap& layerMap, imgSizeH_ = blockConf.img_size_y(); imgSizeW_ = blockConf.img_size_x(); + if (!useGpu_) { + std::vector strides = {(size_t)strideH_, (size_t)strideW_}; + std::vector paddings = {(size_t)paddingH_, (size_t)paddingW_}; + std::vector blocks = {(size_t)blockH_, (size_t)blockW_}; + createFunction(forward_, + "ImageExpand", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + } + return true; } @@ -63,10 +75,11 @@ void BlockExpandLayer::forward(PassType passType) { Layer::forward(passType); size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - size_t blockNum = getBlockNum(); size_t blockSize = blockH_ * blockW_ * channels_; resetOutput(blockNum * batchSize, blockSize); + // TODO(hedaoyuan): After completing the GPU version of ImageExpand, + // refactor the following code. Argument& out = getOutput(); MatrixPtr outV = getOutputValue(); @@ -78,38 +91,49 @@ void BlockExpandLayer::forward(PassType passType) { int* start = out.sequenceStartPositions->getMutableData(false); int* dims = out.cpuSequenceDims->getData(); for (size_t i = 0; i < batchSize; i++) { - outVTrans_->zeroMem(); - /* expand each block as one row */ - MatrixPtr inputTmp = - Matrix::create(input->getData() + i * input->getWidth(), - 1, - input->getWidth(), - false, - useGpu_); - outVTrans_->convExpand(*inputTmp, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_); - MatrixPtr outVTmp = - Matrix::create(outV->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - outVTrans_->transpose(outVTmp, false); + if (useGpu_) { + outVTrans_->zeroMem(); + /* expand each block as one row */ + MatrixPtr inputTmp = + Matrix::create(input->getData() + i * input->getWidth(), + 1, + input->getWidth(), + false, + useGpu_); + outVTrans_->convExpand(*inputTmp, + imgSizeH_, + imgSizeW_, + channels_, + blockH_, + blockW_, + strideH_, + strideW_, + paddingH_, + paddingW_, + outputH_, + outputW_); + MatrixPtr outVTmp = + Matrix::create(outV->getData() + i * blockNum * blockSize, + blockNum, + blockSize, + false, + useGpu_); + outVTrans_->transpose(outVTmp, false); + } start[i] = i * blockNum; dims[2 * i] = outputH_; dims[2 * i + 1] = outputW_; } start[batchSize] = batchSize * blockNum; + if (!useGpu_) { + TensorShape inputShape({batchSize, channels_, imgSizeH_, imgSizeW_}); + TensorShape outputShape({batchSize, blockNum, blockSize}); + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), inputShape); + outputs.addArg(*getOutputValue(), outputShape, ASSIGN_TO); + forward_[0]->calc(inputs, outputs); + } } void BlockExpandLayer::backward(const UpdateCallback& callback) { From 2acb84fe70104980c902b252a26a526a3d943c2a Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 14:18:20 +0800 Subject: [PATCH 03/20] Add ImageExpandGrad Function. --- paddle/function/GemmConvOp.h | 1 + paddle/function/ImageExpandOp.cpp | 224 +++++++++++++++++---- paddle/gserver/layers/BlockExpandLayer.cpp | 89 ++++---- paddle/gserver/layers/BlockExpandLayer.h | 3 + 4 files changed, 237 insertions(+), 80 deletions(-) diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h index 25d2e220bfb93..f724643f35af9 100644 --- a/paddle/function/GemmConvOp.h +++ b/paddle/function/GemmConvOp.h @@ -44,6 +44,7 @@ enum ColFormat { kCFO = 0, kOCF = 1 }; * input_channels, * filter_height, * filter_width] + * TODO(hedaoyuan): Refactor the arguments of the interface with TensorShape. */ template class Im2ColFunctor { diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp index 0c10f30bbd9c1..4d8c25ffcdafa 100644 --- a/paddle/function/ImageExpandOp.cpp +++ b/paddle/function/ImageExpandOp.cpp @@ -70,16 +70,67 @@ class Im2ColFunctor { } }; +template +class Col2ImFunctor { +public: + void operator()(const T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth, + T* imData) { + for (int outputH = 0; outputH < outputHeight; ++outputH) { + for (int outputW = 0; outputW < outputWidth; ++outputW) { + for (int channel = 0; channel < inputChannels; ++channel) { + for (int filterH = 0; filterH < filterHeight; ++filterH) { + for (int filterW = 0; filterW < filterWidth; ++filterW) { + int imRowOffset = + outputH * strideHeight + filterH - paddingHeight; + int imColOffset = outputW * strideWidth + filterW - paddingWidth; + int colDataOffset = + (((outputH * outputWidth + outputW) * inputChannels + + channel) * + filterHeight + + filterH) * + filterWidth + + filterW; + if (imRowOffset >= 0 && imRowOffset < inputHeight && + imColOffset >= 0 && imColOffset < inputWidth) { + int imDataOffset = + (channel * inputHeight + imRowOffset) * inputWidth + + imColOffset; + imData[imDataOffset] += colData[colDataOffset]; + } + } + } + } + } + } + } +}; + /* * \brief Converts the image data of four dimensions(NCHW) into - * a sequence data of three dimensions(NST). Where N is batch size, - * S is the length of the sequence after each image is expanded, - * T is the size of each time step in the sequence. + * a sequence data of three dimensions(NST) in the forward calculation, + * which is reversed in the backward calculation. + * Where N is batch size, S is the length of the sequence after each + * image is expanded, T is the size of each time step in the sequence. * + * Arguments in forward function: * \param inputs[0] Image data of NCHW format. * \param outputs[0] Sequence data of NST format. + * + * Arguments in backward function: + * \param inputs[0] Sequence data of NST format. + * \param outputs[0] Image data of NCHW format. */ -template class ImageExpandFunction : public FunctionBase { public: void init(const FuncConfig& config) override { @@ -93,25 +144,27 @@ class ImageExpandFunction : public FunctionBase { numOutputs_ = 1; } - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - const TensorShape& input = inputs[0].shape(); - const TensorShape& output = outputs[0].shape(); - // input argument should be 4-dimensional. - CHECK_EQ(input.ndims(), (size_t)4); - // output argument should be 3-dimensional. - CHECK_EQ(output.ndims(), (size_t)3); - // The batchSize of the input needs to be equal to - // the batchSize of the output. - CHECK_EQ(input[0], output[0]); - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t seqLength = output[1]; - size_t stepSize = output[2]; + virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {} + + void check(const TensorShape& image, const TensorShape& sequence) { + // image shape should be 4-dimensional. + CHECK_EQ(image.ndims(), (size_t)4); + // sequence shape should be 3-dimensional. + CHECK_EQ(sequence.ndims(), (size_t)3); + // The batchSize of the image needs to be equal to + // the batchSize of the sequence. + CHECK_EQ(image[0], sequence[0]); + } + + // Calculate the shape of colData based on the shape of the image + // and the shape of the sequence. + TensorShape getColShape(const TensorShape& image, + const TensorShape& sequence) { + size_t inputChannels = image[1]; + size_t inputHeight = image[2]; + size_t inputWidth = image[3]; + size_t seqLength = sequence[1]; + size_t stepSize = sequence[2]; size_t outputHeight = 1 + (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH(); @@ -121,8 +174,59 @@ class ImageExpandFunction : public FunctionBase { CHECK_EQ(seqLength, outputHeight * outputWidth); CHECK_EQ(stepSize, inputChannels * blockH() * blockW()); - real* inputData = inputs[0].data(); - real* outputData = outputs[0].data(); + // [output_height, output_width, + // input_channels, filter_height, filter_width] + return TensorShape({outputHeight, + outputWidth, + inputChannels, + (size_t)blockH(), + (size_t)blockW()}); + } + +protected: + std::vector strides_; + std::vector paddings_; + std::vector blocks_; + + inline int strideH() const { return strides_[0]; } + + inline int strideW() const { return strides_[1]; } + + inline int paddingH() const { return paddings_[0]; } + + inline int paddingW() const { return paddings_[1]; } + + inline int blockH() const { return blocks_[0]; } + + inline int blockW() const { return blocks_[1]; } +}; + +template +class ImageExpandForward : public ImageExpandFunction { +public: + void init(const FuncConfig& config) override { + ImageExpandFunction::init(config); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + const TensorShape& image = inputs[0].shape(); + const TensorShape& sequence = outputs[0].shape(); + check(image, sequence); + + TensorShape colShape = getColShape(image, sequence); + size_t batchSize = image[0]; + size_t inputChannels = image[1]; + size_t inputHeight = image[2]; + size_t inputWidth = image[3]; + size_t seqLength = sequence[1]; + size_t stepSize = sequence[2]; + size_t outputHeight = colShape[0]; + size_t outputWidth = colShape[1]; + + real* imageData = inputs[0].data(); + real* seqData = outputs[0].data(); Im2ColFunctor im2col; for (size_t i = 0; i < batchSize; i++) { // The result of im2col is [output_height, output_width, @@ -130,7 +234,7 @@ class ImageExpandFunction : public FunctionBase { // reshape into [seqLength, stepSize], where seqLength is equal // output_height * output_width, stepSize is equal // input_channels * filter_height * filter_width - im2col(inputData, + im2col(imageData, inputChannels, inputHeight, inputWidth, @@ -142,30 +246,64 @@ class ImageExpandFunction : public FunctionBase { paddingW(), outputHeight, outputWidth, - outputData); - inputData += inputChannels * inputHeight * inputWidth; - outputData += seqLength * stepSize; + seqData); + imageData += inputChannels * inputHeight * inputWidth; + seqData += seqLength * stepSize; } } +}; -protected: - std::vector strides_; - std::vector paddings_; - std::vector blocks_; - - inline int strideH() const { return strides_[0]; } - - inline int strideW() const { return strides_[1]; } - - inline int paddingH() const { return paddings_[0]; } +template +class ImageExpandBackward : public ImageExpandFunction { +public: + void init(const FuncConfig& config) override { + ImageExpandFunction::init(config); + } - inline int paddingW() const { return paddings_[1]; } + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + // Since the implementation of Col2ImFunctor is ADD_TO, + // this function only supports ADD_TO mode. + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + const TensorShape& image = outputs[0].shape(); + const TensorShape& sequence = inputs[0].shape(); + check(image, sequence); - inline int blockH() const { return blocks_[0]; } + TensorShape colShape = getColShape(image, sequence); + size_t batchSize = image[0]; + size_t inputChannels = image[1]; + size_t inputHeight = image[2]; + size_t inputWidth = image[3]; + size_t seqLength = sequence[1]; + size_t stepSize = sequence[2]; + size_t outputHeight = colShape[0]; + size_t outputWidth = colShape[1]; - inline int blockW() const { return blocks_[1]; } + real* imageData = outputs[0].data(); + real* seqData = inputs[0].data(); + Col2ImFunctor col2im; + for (size_t i = 0; i < batchSize; i++) { + col2im(seqData, + inputChannels, + inputHeight, + inputWidth, + blockH(), + blockW(), + strideH(), + strideW(), + paddingH(), + paddingW(), + outputHeight, + outputWidth, + imageData); + imageData += inputChannels * inputHeight * inputWidth; + seqData += seqLength * stepSize; + } + } }; -REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandFunction); +REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward); +REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward); } // namespace paddle diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp index 9760d39bb4a3c..c8d0b21c8754d 100644 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ b/paddle/gserver/layers/BlockExpandLayer.cpp @@ -47,6 +47,12 @@ bool BlockExpandLayer::init(const LayerMap& layerMap, .set("strides", strides) .set("paddings", paddings) .set("blocks", blocks)); + createFunction(backward_, + "ImageExpandGrad", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); } return true; @@ -126,12 +132,12 @@ void BlockExpandLayer::forward(PassType passType) { } start[batchSize] = batchSize * blockNum; if (!useGpu_) { - TensorShape inputShape({batchSize, channels_, imgSizeH_, imgSizeW_}); - TensorShape outputShape({batchSize, blockNum, blockSize}); + inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); + outputShape_ = TensorShape({batchSize, blockNum, blockSize}); BufferArgs inputs; BufferArgs outputs; - inputs.addArg(*getInputValue(0), inputShape); - outputs.addArg(*getOutputValue(), outputShape, ASSIGN_TO); + inputs.addArg(*getInputValue(0), inputShape_); + outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO); forward_[0]->calc(inputs, outputs); } } @@ -144,41 +150,50 @@ void BlockExpandLayer::backward(const UpdateCallback& callback) { if (!preGrad) { return; } - MatrixPtr grad = getOutputGrad(); - MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_); - size_t batchSize = preGrad->getHeight(); - CHECK_EQ(batchSize * blockNum, grad->getHeight()); - CHECK_EQ(blockSize, grad->getWidth()); + if (useGpu_) { + MatrixPtr grad = getOutputGrad(); + MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_); + size_t batchSize = preGrad->getHeight(); - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr gradTmp = - Matrix::create(grad->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - gradTmp->transpose(gradTrans, false); - MatrixPtr preGradTmp = - Matrix::create(preGrad->getData() + i * preGrad->getWidth(), - 1, - preGrad->getWidth(), - false, - useGpu_); - preGradTmp->convShrink(*gradTrans, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_, - 1.0, - 1.0); + CHECK_EQ(batchSize * blockNum, grad->getHeight()); + CHECK_EQ(blockSize, grad->getWidth()); + + for (size_t i = 0; i < batchSize; i++) { + MatrixPtr gradTmp = + Matrix::create(grad->getData() + i * blockNum * blockSize, + blockNum, + blockSize, + false, + useGpu_); + gradTmp->transpose(gradTrans, false); + MatrixPtr preGradTmp = + Matrix::create(preGrad->getData() + i * preGrad->getWidth(), + 1, + preGrad->getWidth(), + false, + useGpu_); + preGradTmp->convShrink(*gradTrans, + imgSizeH_, + imgSizeW_, + channels_, + blockH_, + blockW_, + strideH_, + strideW_, + paddingH_, + paddingW_, + outputH_, + outputW_, + 1.0, + 1.0); + } + } else { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outputShape_); + outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO); + backward_[0]->calc(inputs, outputs); } } diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h index 8f347400e60ec..edda0e0b630ae 100644 --- a/paddle/gserver/layers/BlockExpandLayer.h +++ b/paddle/gserver/layers/BlockExpandLayer.h @@ -53,6 +53,9 @@ class BlockExpandLayer : public Layer { /// auxiliary variable, which saves the transposed output value. MatrixPtr outVTrans_; + TensorShape inputShape_; + TensorShape outputShape_; + public: explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {} From 0672d330a3d4f55c54ce8568c974a10c02ba40cf Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 15:42:17 +0800 Subject: [PATCH 04/20] Use the TensorShape to reconstruct the arguments of the Im2ColFunctor and Col2ImFunctor interfaces. --- paddle/function/Im2Col.h | 92 +++++++++++++++++++++++ paddle/function/ImageExpandOp.cpp | 120 +++++++++++++----------------- 2 files changed, 145 insertions(+), 67 deletions(-) create mode 100644 paddle/function/Im2Col.h diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h new file mode 100644 index 0000000000000..d461ec7510b48 --- /dev/null +++ b/paddle/function/Im2Col.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { + +/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ +enum ColFormat { kCFO = 0, kOCF = 1 }; + +/* + * \brief Converts the image data of three dimensions(CHW) into a colData of + * five dimensions in the Im2ColFunctor calculation, + * And in the Col2ImFunctor calculation, it is reversed. + * + * \param imData Image data of NCHW format. + * The shape of imData is: + * [inputChannels, inputHeight, inputWidth]. + * \param colData colData data. + * + * If the template argument Format is kCFO, the shape of colData is: + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * inputChannels * filterHeight * filterWidth, and the width is equal + * outputHeight * outputWidth. + * + * Reshape: + * shape of colData shape of sequence + * [inputChannels, + * filterHeight, + * filterWidth, ======> [seqLength, stepSize] + * outputHeight, + * outputWidth] + * + * If the template argument Format is kOCF, the shape of colData is: + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + * So, it is easy to reshape into a sequence matrix for rnn calculation. + * The shape of sequence matrix is [seqLength, stepSize], where the seqLength + * is equal outputHeight * outputWidth, and the stepSize is equal + * inputChannels * filterHeight * filterWidth. + * + * Reshape: + * shape of colData shape of sequence + * [outputHeight, + * outputWidth, + * inputChannels, ======> [seqLength, stepSize] + * filterHeight, + * filterWidth] + * + * \note The caller needs to ensure that imShape.inputChannels is equal to + * colShape.inputChannels. + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth); +}; + +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth); +}; + +} // namespace paddle diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp index 4d8c25ffcdafa..ad34967bd6580 100644 --- a/paddle/function/ImageExpandOp.cpp +++ b/paddle/function/ImageExpandOp.cpp @@ -13,31 +13,33 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "Function.h" -#include "GemmConvOp.h" +#include "Im2Col.h" namespace paddle { /* - * imData = [input_channels, input_height, input_width] - * colData = [output_height, output_width, - * input_channels, filter_height, filter_width] + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] */ template class Im2ColFunctor { public: void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, int strideHeight, int strideWidth, int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData) { + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; for (int outputH = 0; outputH < outputHeight; ++outputH) { for (int outputW = 0; outputW < outputWidth; ++outputW) { for (int channel = 0; channel < inputChannels; ++channel) { @@ -55,7 +57,7 @@ class Im2ColFunctor { filterW; if (imRowOffset < 0 || imRowOffset >= inputHeight || imColOffset < 0 || imColOffset >= inputWidth) { - colData[colDataOffset] = T(0); + colData[colDataOffset] = float(0); } else { int imDataOffset = (channel * inputHeight + imRowOffset) * inputWidth + @@ -70,22 +72,29 @@ class Im2ColFunctor { } }; +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ template class Col2ImFunctor { public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, int strideHeight, int strideWidth, int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData) { + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; for (int outputH = 0; outputH < outputHeight; ++outputH) { for (int outputW = 0; outputW < outputWidth; ++outputW) { for (int channel = 0; channel < inputChannels; ++channel) { @@ -146,7 +155,7 @@ class ImageExpandFunction : public FunctionBase { virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {} - void check(const TensorShape& image, const TensorShape& sequence) { + void check(const TensorShape& image, const TensorShape& sequence) const { // image shape should be 4-dimensional. CHECK_EQ(image.ndims(), (size_t)4); // sequence shape should be 3-dimensional. @@ -159,7 +168,7 @@ class ImageExpandFunction : public FunctionBase { // Calculate the shape of colData based on the shape of the image // and the shape of the sequence. TensorShape getColShape(const TensorShape& image, - const TensorShape& sequence) { + const TensorShape& sequence) const { size_t inputChannels = image[1]; size_t inputHeight = image[2]; size_t inputWidth = image[3]; @@ -174,8 +183,7 @@ class ImageExpandFunction : public FunctionBase { CHECK_EQ(seqLength, outputHeight * outputWidth); CHECK_EQ(stepSize, inputChannels * blockH() * blockW()); - // [output_height, output_width, - // input_channels, filter_height, filter_width] + // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] return TensorShape({outputHeight, outputWidth, inputChannels, @@ -215,40 +223,29 @@ class ImageExpandForward : public ImageExpandFunction { const TensorShape& sequence = outputs[0].shape(); check(image, sequence); + TensorShape imShape = TensorShape({image[1], image[2], image[3]}); TensorShape colShape = getColShape(image, sequence); size_t batchSize = image[0]; - size_t inputChannels = image[1]; - size_t inputHeight = image[2]; - size_t inputWidth = image[3]; - size_t seqLength = sequence[1]; - size_t stepSize = sequence[2]; - size_t outputHeight = colShape[0]; - size_t outputWidth = colShape[1]; real* imageData = inputs[0].data(); real* seqData = outputs[0].data(); Im2ColFunctor im2col; for (size_t i = 0; i < batchSize; i++) { - // The result of im2col is [output_height, output_width, - // input_channels, filter_height, filter_width], and it is easy to + // The result of im2col is [outputHeight, outputWidth, + // inputChannels, filterHeight, filterWidth], and it is easy to // reshape into [seqLength, stepSize], where seqLength is equal // output_height * output_width, stepSize is equal // input_channels * filter_height * filter_width im2col(imageData, - inputChannels, - inputHeight, - inputWidth, - blockH(), - blockW(), + imShape, + seqData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - seqData); - imageData += inputChannels * inputHeight * inputWidth; - seqData += seqLength * stepSize; + paddingW()); + imageData += imShape.getElements(); + seqData += colShape.getElements(); } } }; @@ -270,35 +267,24 @@ class ImageExpandBackward : public ImageExpandFunction { const TensorShape& sequence = inputs[0].shape(); check(image, sequence); + TensorShape imShape = TensorShape({image[1], image[2], image[3]}); TensorShape colShape = getColShape(image, sequence); size_t batchSize = image[0]; - size_t inputChannels = image[1]; - size_t inputHeight = image[2]; - size_t inputWidth = image[3]; - size_t seqLength = sequence[1]; - size_t stepSize = sequence[2]; - size_t outputHeight = colShape[0]; - size_t outputWidth = colShape[1]; real* imageData = outputs[0].data(); real* seqData = inputs[0].data(); Col2ImFunctor col2im; for (size_t i = 0; i < batchSize; i++) { - col2im(seqData, - inputChannels, - inputHeight, - inputWidth, - blockH(), - blockW(), + col2im(imageData, + imShape, + seqData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - imageData); - imageData += inputChannels * inputHeight * inputWidth; - seqData += seqLength * stepSize; + paddingW()); + imageData += imShape.getElements(); + seqData += colShape.getElements(); } } }; From 9c009b4087afa0ac61425cd9e45f8c2e60e92568 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 15:43:48 +0800 Subject: [PATCH 05/20] Remove GemmConvOp.h file. --- paddle/function/GemmConvOp.h | 85 ------------------------------------ 1 file changed, 85 deletions(-) delete mode 100644 paddle/function/GemmConvOp.h diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h deleted file mode 100644 index f724643f35af9..0000000000000 --- a/paddle/function/GemmConvOp.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// #include "ConvOp.h" - -namespace paddle { - -/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ -enum ColFormat { kCFO = 0, kOCF = 1 }; - -/* - * \brief Converts the image data of four dimensions(NCHW) into a colData. - * Then you can reshape colData to a convolution matrix for - * convolution calculation based on matrix multiplication. - * - * \param imData Image data of NCHW format. - * The format of imData is: - * [input_channels, input_height, input_width]. - * \param colData colData data. - * If the template argument Format is kCFO, - * the format of colData is: - * [input_channels, - * filter_height, - * filter_width, - * output_height, - * output_width] - * If the template argument Format is kOCF, - * the format of colData is: - * [output_height, - * output_width, - * input_channels, - * filter_height, - * filter_width] - * TODO(hedaoyuan): Refactor the arguments of the interface with TensorShape. - */ -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData); -}; - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData); -}; - -} // namespace paddle From 34362d938175a012841275849f3b8102d736b4c6 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 15:57:01 +0800 Subject: [PATCH 06/20] Fix some of the wrong comments in im2col.h file. --- paddle/function/Im2Col.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index d461ec7510b48..6d76e229bfc21 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -24,10 +24,11 @@ enum ColFormat { kCFO = 0, kOCF = 1 }; * five dimensions in the Im2ColFunctor calculation, * And in the Col2ImFunctor calculation, it is reversed. * - * \param imData Image data of NCHW format. - * The shape of imData is: - * [inputChannels, inputHeight, inputWidth]. - * \param colData colData data. + * \param imData Image data. + * \param imShape The shape of imData, + * [inputChannels, inputHeight, inputWidth]. + * \param colData Column data. + * \param colShape The shape of colData. * * If the template argument Format is kCFO, the shape of colData is: * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] @@ -38,10 +39,10 @@ enum ColFormat { kCFO = 0, kOCF = 1 }; * outputHeight * outputWidth. * * Reshape: - * shape of colData shape of sequence + * shape of colData shape of convolution matrix * [inputChannels, * filterHeight, - * filterWidth, ======> [seqLength, stepSize] + * filterWidth, ======> [height, width] * outputHeight, * outputWidth] * @@ -53,7 +54,7 @@ enum ColFormat { kCFO = 0, kOCF = 1 }; * inputChannels * filterHeight * filterWidth. * * Reshape: - * shape of colData shape of sequence + * shape of colData shape of sequence matrix * [outputHeight, * outputWidth, * inputChannels, ======> [seqLength, stepSize] From 152bd2f9c867e8e165c3d22810281023880b3d16 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 20:30:02 +0800 Subject: [PATCH 07/20] Add the GPU version implementation of ImageExpand function. --- paddle/function/Im2Col.h | 3 + paddle/function/Im2ColOpGpu.cu | 130 +++++++++++++++++++++ paddle/function/ImageExpandOp.cpp | 3 + paddle/gserver/layers/BlockExpandLayer.cpp | 73 ++++-------- paddle/gserver/layers/BlockExpandLayer.h | 3 - 5 files changed, 156 insertions(+), 56 deletions(-) create mode 100644 paddle/function/Im2ColOpGpu.cu diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 6d76e229bfc21..48e2e32f9256f 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -14,6 +14,9 @@ limitations under the License. */ #pragma once +#include "TensorShape.h" +#include "TensorType.h" + namespace paddle { /* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu new file mode 100644 index 0000000000000..1dac2585db721 --- /dev/null +++ b/paddle/function/Im2ColOpGpu.cu @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Im2Col.h" + +namespace paddle { + +template +__global__ +void im2colOCF(const T* imData, T* colData, + int inputChannels, + int inputHeight, int inputWidth, + int filterHeight, int filterWidth, + int strideHeight, int strideWidth, + int paddingHeight, int paddingWidth, + int outputHeight, int outputWidth) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int swId = blockIdx.x; + int shId = blockIdx.y; + + for (int channelId = threadIdx.z; + channelId < inputChannels; + channelId += blockDim.z) { + int widthOffset = idx + swId * strideWidth - paddingWidth; + int heightOffset = idy + shId * strideHeight - paddingHeight; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; + + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) + * (inputChannels * filterHeight * filterWidth); + + if (idx < filterWidth && idy < filterHeight) { + if (heightOffset >= inputHeight || heightOffset < 0 || + widthOffset >= inputWidth || widthOffset < 0) { + colData[colOffset] = T(0); + } else { + colData[colOffset] = imData[imOffset]; + } + } + } +} + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + + int blockDimX = 0; + int blockDimY = 0; + if (filterHeight <= 4 && filterWidth <= 4) { + blockDimX = 4; + blockDimY = 4; + } else if (filterHeight <= 8 && filterWidth <= 8) { + blockDimX = 8; + blockDimY = 8; + } else if (filterHeight <= 16 && filterWidth <= 16) { + blockDimX = 16; + blockDimY = 16; + } else { + blockDimX = 32; + blockDimY = 32; + } + + int blockDimZ = 1024 / blockDimX / blockDimY; + dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); + dim3 grid(outputWidth, outputHeight); + im2colOCF<<< grid, threads, 0, STREAM_DEFAULT >>> + (imData, colData, inputChannels, inputHeight, inputWidth, + filterHeight, filterWidth, strideHeight, strideWidth, + paddingHeight, paddingWidth, outputHeight, outputWidth); + CHECK_SYNC("Im2ColFunctor GPU failed"); + } +}; + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; + +} // namespace paddle diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp index ad34967bd6580..fe4c8fefcf5e8 100644 --- a/paddle/function/ImageExpandOp.cpp +++ b/paddle/function/ImageExpandOp.cpp @@ -291,5 +291,8 @@ class ImageExpandBackward : public ImageExpandFunction { REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward); REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(ImageExpand, GPU, ImageExpandForward); +#endif } // namespace paddle diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp index c8d0b21c8754d..1889b347c2d89 100644 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ b/paddle/gserver/layers/BlockExpandLayer.cpp @@ -37,16 +37,16 @@ bool BlockExpandLayer::init(const LayerMap& layerMap, imgSizeH_ = blockConf.img_size_y(); imgSizeW_ = blockConf.img_size_x(); + std::vector strides = {(size_t)strideH_, (size_t)strideW_}; + std::vector paddings = {(size_t)paddingH_, (size_t)paddingW_}; + std::vector blocks = {(size_t)blockH_, (size_t)blockW_}; + createFunction(forward_, + "ImageExpand", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); if (!useGpu_) { - std::vector strides = {(size_t)strideH_, (size_t)strideW_}; - std::vector paddings = {(size_t)paddingH_, (size_t)paddingW_}; - std::vector blocks = {(size_t)blockH_, (size_t)blockW_}; - createFunction(forward_, - "ImageExpand", - FuncConfig() - .set("strides", strides) - .set("paddings", paddings) - .set("blocks", blocks)); createFunction(backward_, "ImageExpandGrad", FuncConfig() @@ -84,62 +84,29 @@ void BlockExpandLayer::forward(PassType passType) { size_t blockNum = getBlockNum(); size_t blockSize = blockH_ * blockW_ * channels_; resetOutput(blockNum * batchSize, blockSize); - // TODO(hedaoyuan): After completing the GPU version of ImageExpand, - // refactor the following code. - Argument& out = getOutput(); - MatrixPtr outV = getOutputValue(); - MatrixPtr input = getPrev(0)->getOutputValue(); - Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_); + // calculate output_.value + inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); + outputShape_ = TensorShape({batchSize, blockNum, blockSize}); + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), inputShape_); + outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO); + forward_[0]->calc(inputs, outputs); + + // calculate output_.sequenceStartPositions and output_.cpuSequenceDims + Argument& out = getOutput(); ICpuGpuVector::resizeOrCreate( out.sequenceStartPositions, batchSize + 1, false); IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false); int* start = out.sequenceStartPositions->getMutableData(false); int* dims = out.cpuSequenceDims->getData(); for (size_t i = 0; i < batchSize; i++) { - if (useGpu_) { - outVTrans_->zeroMem(); - /* expand each block as one row */ - MatrixPtr inputTmp = - Matrix::create(input->getData() + i * input->getWidth(), - 1, - input->getWidth(), - false, - useGpu_); - outVTrans_->convExpand(*inputTmp, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_); - MatrixPtr outVTmp = - Matrix::create(outV->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - outVTrans_->transpose(outVTmp, false); - } start[i] = i * blockNum; dims[2 * i] = outputH_; dims[2 * i + 1] = outputW_; } start[batchSize] = batchSize * blockNum; - if (!useGpu_) { - inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); - outputShape_ = TensorShape({batchSize, blockNum, blockSize}); - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(0), inputShape_); - outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO); - forward_[0]->calc(inputs, outputs); - } } void BlockExpandLayer::backward(const UpdateCallback& callback) { diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h index edda0e0b630ae..15ce73ab8b2ca 100644 --- a/paddle/gserver/layers/BlockExpandLayer.h +++ b/paddle/gserver/layers/BlockExpandLayer.h @@ -50,9 +50,6 @@ class BlockExpandLayer : public Layer { size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_; size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_; - /// auxiliary variable, which saves the transposed output value. - MatrixPtr outVTrans_; - TensorShape inputShape_; TensorShape outputShape_; From f8ef8c174c442f14662a94e59fcda6587498c8a5 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 21:07:20 +0800 Subject: [PATCH 08/20] Add the GPU version implementation of ImageExpandGrad function. --- paddle/function/Im2ColOpGpu.cu | 107 +++++++++++++++++---- paddle/function/ImageExpandOp.cpp | 1 + paddle/gserver/layers/BlockExpandLayer.cpp | 33 +++---- 3 files changed, 103 insertions(+), 38 deletions(-) diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu index 1dac2585db721..bddd8ffc7c0b4 100644 --- a/paddle/function/Im2ColOpGpu.cu +++ b/paddle/function/Im2ColOpGpu.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "Im2Col.h" +#include "hl_device_functions.cuh" namespace paddle { @@ -25,30 +26,29 @@ void im2colOCF(const T* imData, T* colData, int strideHeight, int strideWidth, int paddingHeight, int paddingWidth, int outputHeight, int outputWidth) { - int idx = threadIdx.x; - int idy = threadIdx.y; int swId = blockIdx.x; int shId = blockIdx.y; - for (int channelId = threadIdx.z; channelId < inputChannels; channelId += blockDim.z) { - int widthOffset = idx + swId * strideWidth - paddingWidth; - int heightOffset = idy + shId * strideHeight - paddingHeight; - int imOffset = widthOffset + heightOffset * inputWidth - + channelId * inputHeight * inputWidth; - - int colOffset = idx + idy * filterWidth - + channelId * filterHeight * filterWidth - + (shId * outputWidth + swId) - * (inputChannels * filterHeight * filterWidth); - - if (idx < filterWidth && idy < filterHeight) { - if (heightOffset >= inputHeight || heightOffset < 0 || - widthOffset >= inputWidth || widthOffset < 0) { - colData[colOffset] = T(0); - } else { - colData[colOffset] = imData[imOffset]; + for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { + int widthOffset = idx + swId * strideWidth - paddingWidth; + int heightOffset = idy + shId * strideHeight - paddingHeight; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; + + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) + * (inputChannels * filterHeight * filterWidth); + + if (heightOffset >= inputHeight || heightOffset < 0 || + widthOffset >= inputWidth || widthOffset < 0) { + colData[colOffset] = T(0); + } else { + colData[colOffset] = imData[imOffset]; + } } } } @@ -105,6 +105,41 @@ public: } }; +template +__global__ +void col2imOCF(T* imData, const T* colData, + int inputChannels, + int inputHeight, int inputWidth, + int filterHeight, int filterWidth, + int strideHeight, int strideWidth, + int paddingHeight, int paddingWidth, + int outputHeight, int outputWidth) { + int swId = blockIdx.x; + int shId = blockIdx.y; + for (int channelId = threadIdx.z; + channelId < inputChannels; + channelId += blockDim.z) { + for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { + int widthOffset = idx + swId * strideWidth - paddingWidth; + int heightOffset = idy + shId * strideHeight - paddingHeight; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; + + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) + * (inputChannels * filterHeight * filterWidth); + + if (heightOffset >= 0 && heightOffset < inputHeight && + widthOffset >= 0 && widthOffset < inputWidth) { + paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]); + } + } + } + } +} + /* * imShape = [inputChannels, inputHeight, inputWidth] * colShape = @@ -121,10 +156,44 @@ public: int strideWidth, int paddingHeight, int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + + int blockDimX = 0; + int blockDimY = 0; + if (filterHeight <= 4 && filterWidth <= 4) { + blockDimX = 4; + blockDimY = 4; + } else if (filterHeight <= 8 && filterWidth <= 8) { + blockDimX = 8; + blockDimY = 8; + } else if (filterHeight <= 16 && filterWidth <= 16) { + blockDimX = 16; + blockDimY = 16; + } else { + blockDimX = 32; + blockDimY = 32; + } + + int blockDimZ = 1024 / blockDimX / blockDimY; + dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); + dim3 grid(outputWidth, outputHeight); + col2imOCF<<< grid, threads, 0, STREAM_DEFAULT >>> + (imData, colData, inputChannels, inputHeight, inputWidth, + filterHeight, filterWidth, strideHeight, strideWidth, + paddingHeight, paddingWidth, outputHeight, outputWidth); + CHECK_SYNC("Col2ImFunctor GPU failed"); } }; template class Im2ColFunctor; template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; } // namespace paddle diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp index fe4c8fefcf5e8..f227f6d0e10fa 100644 --- a/paddle/function/ImageExpandOp.cpp +++ b/paddle/function/ImageExpandOp.cpp @@ -293,6 +293,7 @@ REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward); REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward); #ifndef PADDLE_ONLY_CPU REGISTER_TYPED_FUNC(ImageExpand, GPU, ImageExpandForward); +REGISTER_TYPED_FUNC(ImageExpandGrad, GPU, ImageExpandBackward); #endif } // namespace paddle diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp index 1889b347c2d89..a5e644a4ae397 100644 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ b/paddle/gserver/layers/BlockExpandLayer.cpp @@ -46,14 +46,12 @@ bool BlockExpandLayer::init(const LayerMap& layerMap, .set("strides", strides) .set("paddings", paddings) .set("blocks", blocks)); - if (!useGpu_) { - createFunction(backward_, - "ImageExpandGrad", - FuncConfig() - .set("strides", strides) - .set("paddings", paddings) - .set("blocks", blocks)); - } + createFunction(backward_, + "ImageExpandGrad", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); return true; } @@ -110,14 +108,16 @@ void BlockExpandLayer::forward(PassType passType) { } void BlockExpandLayer::backward(const UpdateCallback& callback) { - size_t blockNum = outputH_ * outputW_; - size_t blockSize = blockH_ * blockW_ * channels_; /* Calculate the input layers error */ - MatrixPtr preGrad = inputLayers_[0]->getOutputGrad(); - if (!preGrad) { - return; + if (getInputGrad(0)) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outputShape_); + outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO); + backward_[0]->calc(inputs, outputs); } +#if 0 if (useGpu_) { MatrixPtr grad = getOutputGrad(); MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_); @@ -155,13 +155,8 @@ void BlockExpandLayer::backward(const UpdateCallback& callback) { 1.0, 1.0); } - } else { - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getOutputGrad(), outputShape_); - outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO); - backward_[0]->calc(inputs, outputs); } +#endif } } // namespace paddle From bf6dfc1ff2a01cc35bf6a91177463cd40e328003 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 21:30:32 +0800 Subject: [PATCH 09/20] Remove some of the code that has been commented out. --- paddle/gserver/layers/BlockExpandLayer.cpp | 41 ---------------------- 1 file changed, 41 deletions(-) diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp index a5e644a4ae397..adc9a814ffbb1 100644 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ b/paddle/gserver/layers/BlockExpandLayer.cpp @@ -116,47 +116,6 @@ void BlockExpandLayer::backward(const UpdateCallback& callback) { outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO); backward_[0]->calc(inputs, outputs); } - -#if 0 - if (useGpu_) { - MatrixPtr grad = getOutputGrad(); - MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_); - size_t batchSize = preGrad->getHeight(); - - CHECK_EQ(batchSize * blockNum, grad->getHeight()); - CHECK_EQ(blockSize, grad->getWidth()); - - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr gradTmp = - Matrix::create(grad->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - gradTmp->transpose(gradTrans, false); - MatrixPtr preGradTmp = - Matrix::create(preGrad->getData() + i * preGrad->getWidth(), - 1, - preGrad->getWidth(), - false, - useGpu_); - preGradTmp->convShrink(*gradTrans, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_, - 1.0, - 1.0); - } - } -#endif } } // namespace paddle From d558b8bb82d6428b58f7ceb60ea87afcadce03ba Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 21 Jun 2017 10:36:36 +0800 Subject: [PATCH 10/20] Move the code in the GemmConvOpGpu.cu file into Im2ColOpGpu.cu. --- paddle/function/Im2ColOpGpu.cu | 172 +++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu index bddd8ffc7c0b4..361ecc4401a16 100644 --- a/paddle/function/Im2ColOpGpu.cu +++ b/paddle/function/Im2ColOpGpu.cu @@ -17,6 +17,178 @@ limitations under the License. */ namespace paddle { +template +__global__ +void im2col(const T* data_im, int numOuts, int height, int width, + int blockH, int blockW, + int strideH, int strideW, + int paddingH, int paddingW, + int height_col, int width_col, + T* data_col) { + int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < numOuts) { + int w_out = index % width_col; + index /= width_col; + int h_out = index % height_col; + int channel_in = index / height_col; + int channel_out = channel_in * blockH * blockW; + int h_in = h_out * strideH; + int w_in = w_out * strideW; + + data_col += (channel_out * height_col + h_out) * width_col + w_out; + for (int i = 0; i < blockH; ++i) { + for (int j = 0; j < blockW; ++j) { + int rIdx = int(h_in+i); + int cIdx = int(w_in+j); + if ((rIdx-(int)paddingH) >= (int)height || + (rIdx-(int)paddingH) < 0 || + (cIdx-(int)paddingW) >= (int)width || + (cIdx-(int)paddingW) < 0) { + *data_col = 0; + } else { + rIdx = rIdx + channel_in*height - paddingH; + cIdx = cIdx - paddingW; + *data_col = data_im[rIdx* width + cIdx]; + } + data_col += height_col * width_col; + } + } + } +} + +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + + int numKernels = inputChannels * outputHeight * outputWidth; + int blocks = (numKernels + 1024 -1) / 1024; + int blockX = 512; + int blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + im2col<<< grid, threads, 0, STREAM_DEFAULT >>> + (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth, + strideHeight, strideWidth, paddingHeight, paddingWidth, + outputHeight, outputWidth, colData); + CHECK_SYNC("Im2ColFunctor GPU failed"); + } +}; + +template +__global__ +void col2im(size_t n, const T* data_col, size_t height, + size_t width, size_t channels, + size_t blockH, size_t blockW, + size_t strideH, size_t strideW, + size_t paddingH, size_t paddingW, + size_t height_col, size_t width_col, + T* data_im) { + size_t index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < n) { + T val = 0; + int w = int(index % width); + int h = int((index / width) % height); + int c = int(index / (width * height)); + if ((w - (int)paddingW) >= 0 && + (w - (int)paddingW) < (width-2 * paddingW) && + (h - (int)paddingH) >= 0 && + (h - paddingH) < (height - 2 * paddingH)) { + // compute the start and end of the output + int w_col_start = + (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; + int w_col_end = + min((int)(w / (int)strideW + 1), (int)(width_col)); + int h_col_start = + (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; + int h_col_end = min(int(h / strideH + 1), int(height_col)); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + // the col location: [c * width * height + h_out, w_out] + int c_col = int(c * blockH* blockW) + \ + (h - h_col * (int)strideH) * (int)blockW + + (w - w_col * (int)strideW); + val += data_col[(c_col * height_col + h_col) * width_col + w_col]; + } + } + h -= paddingH; + w -= paddingW; + data_im[c*((width-2*paddingW) * (height-2*paddingH)) + + h*(width-2*paddingW) + w] += val; + } + } +} + +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + + size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) + * (inputWidth + 2*paddingWidth); + + size_t blocks = (numKernels + 1024 -1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks+512-1)/512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + col2im<<< grid, threads, 0, STREAM_DEFAULT >>> + (numKernels, + colData, + inputHeight + 2*paddingHeight, + inputWidth + 2*paddingWidth, + inputChannels, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth, + imData); + CHECK_SYNC("Col2ImFunctor GPU failed"); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + template __global__ void im2colOCF(const T* imData, T* colData, From eb0c7e5ebc9a8c267cf4dc399beeb6b93dcbe6c6 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 21 Jun 2017 11:03:30 +0800 Subject: [PATCH 11/20] Move the Im2Col code of the CPU version into the Im2ColOp.cpp file. --- paddle/function/Im2ColOp.cpp | 235 ++++++++++++++++++++++++++++++ paddle/function/Im2ColOpGpu.cu | 26 +++- paddle/function/ImageExpandOp.cpp | 108 -------------- 3 files changed, 253 insertions(+), 116 deletions(-) create mode 100644 paddle/function/Im2ColOp.cpp diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp new file mode 100644 index 0000000000000..b7d1eb1eded7a --- /dev/null +++ b/paddle/function/Im2ColOp.cpp @@ -0,0 +1,235 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Im2Col.h" + +namespace paddle { + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputHeight = colShape[3]; + int outputWidth = colShape[4]; + int channelsCol = inputChannels * filterHeight * filterWidth; + + for (int c = 0; c < channelsCol; ++c) { + int wOffset = c % filterWidth; + int hOffset = (c / filterWidth) % filterHeight; + int c_im = c / filterWidth / filterHeight; + for (int h = 0; h < outputHeight; ++h) { + for (int w = 0; w < outputWidth; ++w) { + int imRowIdx = h * strideHeight + hOffset; + int imColIdx = w * strideWidth + wOffset; + if ((imRowIdx - paddingHeight) < 0 || + (imRowIdx - paddingHeight) >= inputHeight || + (imColIdx - paddingWidth) < 0 || + (imColIdx - paddingWidth) >= inputWidth) { + colData[(c * outputHeight + h) * outputWidth + w] = T(0); + } else { + imRowIdx += c_im * inputHeight - paddingHeight; + imColIdx -= paddingWidth; + colData[(c * outputHeight + h) * outputWidth + w] = + imData[imRowIdx * inputWidth + imColIdx]; + } + } + } + } + } +}; + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + */ +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputHeight = colShape[3]; + int outputWidth = colShape[4]; + int channelsCol = inputChannels * filterHeight * filterWidth; + + for (int c = 0; c < channelsCol; ++c) { + int wOffset = c % filterWidth; + int hOffset = (c / filterWidth) % filterHeight; + int c_im = c / filterWidth / filterHeight; + for (int h = 0; h < outputHeight; ++h) { + for (int w = 0; w < outputWidth; ++w) { + int imRowIdx = h * strideHeight + hOffset; + int imColIdx = w * strideWidth + wOffset; + if ((imRowIdx - paddingHeight) >= 0 && + (imRowIdx - paddingHeight) < inputHeight && + (imColIdx - paddingWidth) >= 0 && + (imColIdx - paddingWidth) < inputWidth) { + imRowIdx += c_im * inputHeight - paddingHeight; + imColIdx -= paddingWidth; + imData[imRowIdx * inputWidth + imColIdx] += + colData[(c * outputHeight + h) * outputWidth + w]; + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + for (int outputH = 0; outputH < outputHeight; ++outputH) { + for (int outputW = 0; outputW < outputWidth; ++outputW) { + for (int channel = 0; channel < inputChannels; ++channel) { + for (int filterH = 0; filterH < filterHeight; ++filterH) { + for (int filterW = 0; filterW < filterWidth; ++filterW) { + int imRowOffset = + outputH * strideHeight + filterH - paddingHeight; + int imColOffset = outputW * strideWidth + filterW - paddingWidth; + int colDataOffset = + (((outputH * outputWidth + outputW) * inputChannels + + channel) * + filterHeight + + filterH) * + filterWidth + + filterW; + if (imRowOffset < 0 || imRowOffset >= inputHeight || + imColOffset < 0 || imColOffset >= inputWidth) { + colData[colDataOffset] = float(0); + } else { + int imDataOffset = + (channel * inputHeight + imRowOffset) * inputWidth + + imColOffset; + colData[colDataOffset] = imData[imDataOffset]; + } + } + } + } + } + } + } +}; + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + for (int outputH = 0; outputH < outputHeight; ++outputH) { + for (int outputW = 0; outputW < outputWidth; ++outputW) { + for (int channel = 0; channel < inputChannels; ++channel) { + for (int filterH = 0; filterH < filterHeight; ++filterH) { + for (int filterW = 0; filterW < filterWidth; ++filterW) { + int imRowOffset = + outputH * strideHeight + filterH - paddingHeight; + int imColOffset = outputW * strideWidth + filterW - paddingWidth; + int colDataOffset = + (((outputH * outputWidth + outputW) * inputChannels + + channel) * + filterHeight + + filterH) * + filterWidth + + filterW; + if (imRowOffset >= 0 && imRowOffset < inputHeight && + imColOffset >= 0 && imColOffset < inputWidth) { + int imDataOffset = + (channel * inputHeight + imRowOffset) * inputWidth + + imColOffset; + imData[imDataOffset] += colData[colDataOffset]; + } + } + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace paddle diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu index 361ecc4401a16..15ba854009636 100644 --- a/paddle/function/Im2ColOpGpu.cu +++ b/paddle/function/Im2ColOpGpu.cu @@ -57,6 +57,11 @@ void im2col(const T* data_im, int numOuts, int height, int width, } } +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + */ template class Im2ColFunctor { public: @@ -71,10 +76,10 @@ public: int inputChannels = imShape[0]; int inputHeight = imShape[1]; int inputWidth = imShape[2]; - int filterHeight = colShape[3]; - int filterWidth = colShape[4]; - int outputHeight = colShape[0]; - int outputWidth = colShape[1]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputHeight = colShape[3]; + int outputWidth = colShape[4]; int numKernels = inputChannels * outputHeight * outputWidth; int blocks = (numKernels + 1024 -1) / 1024; @@ -135,6 +140,11 @@ void col2im(size_t n, const T* data_col, size_t height, } } +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + */ template class Col2ImFunctor { public: @@ -149,10 +159,10 @@ public: int inputChannels = imShape[0]; int inputHeight = imShape[1]; int inputWidth = imShape[2]; - int filterHeight = colShape[3]; - int filterWidth = colShape[4]; - int outputHeight = colShape[0]; - int outputWidth = colShape[1]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputHeight = colShape[3]; + int outputWidth = colShape[4]; size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) * (inputWidth + 2*paddingWidth); diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp index f227f6d0e10fa..625bf5b6edf44 100644 --- a/paddle/function/ImageExpandOp.cpp +++ b/paddle/function/ImageExpandOp.cpp @@ -17,114 +17,6 @@ limitations under the License. */ namespace paddle { -/* - * imShape = [inputChannels, inputHeight, inputWidth] - * colShape = - * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] - */ -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - const TensorShape& imShape, - T* colData, - const TensorShape& colShape, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth) { - int inputChannels = imShape[0]; - int inputHeight = imShape[1]; - int inputWidth = imShape[2]; - int filterHeight = colShape[3]; - int filterWidth = colShape[4]; - int outputHeight = colShape[0]; - int outputWidth = colShape[1]; - for (int outputH = 0; outputH < outputHeight; ++outputH) { - for (int outputW = 0; outputW < outputWidth; ++outputW) { - for (int channel = 0; channel < inputChannels; ++channel) { - for (int filterH = 0; filterH < filterHeight; ++filterH) { - for (int filterW = 0; filterW < filterWidth; ++filterW) { - int imRowOffset = - outputH * strideHeight + filterH - paddingHeight; - int imColOffset = outputW * strideWidth + filterW - paddingWidth; - int colDataOffset = - (((outputH * outputWidth + outputW) * inputChannels + - channel) * - filterHeight + - filterH) * - filterWidth + - filterW; - if (imRowOffset < 0 || imRowOffset >= inputHeight || - imColOffset < 0 || imColOffset >= inputWidth) { - colData[colDataOffset] = float(0); - } else { - int imDataOffset = - (channel * inputHeight + imRowOffset) * inputWidth + - imColOffset; - colData[colDataOffset] = imData[imDataOffset]; - } - } - } - } - } - } - } -}; - -/* - * imShape = [inputChannels, inputHeight, inputWidth] - * colShape = - * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] - */ -template -class Col2ImFunctor { -public: - void operator()(T* imData, - const TensorShape& imShape, - const T* colData, - const TensorShape& colShape, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth) { - int inputChannels = imShape[0]; - int inputHeight = imShape[1]; - int inputWidth = imShape[2]; - int filterHeight = colShape[3]; - int filterWidth = colShape[4]; - int outputHeight = colShape[0]; - int outputWidth = colShape[1]; - for (int outputH = 0; outputH < outputHeight; ++outputH) { - for (int outputW = 0; outputW < outputWidth; ++outputW) { - for (int channel = 0; channel < inputChannels; ++channel) { - for (int filterH = 0; filterH < filterHeight; ++filterH) { - for (int filterW = 0; filterW < filterWidth; ++filterW) { - int imRowOffset = - outputH * strideHeight + filterH - paddingHeight; - int imColOffset = outputW * strideWidth + filterW - paddingWidth; - int colDataOffset = - (((outputH * outputWidth + outputW) * inputChannels + - channel) * - filterHeight + - filterH) * - filterWidth + - filterW; - if (imRowOffset >= 0 && imRowOffset < inputHeight && - imColOffset >= 0 && imColOffset < inputWidth) { - int imDataOffset = - (channel * inputHeight + imRowOffset) * inputWidth + - imColOffset; - imData[imDataOffset] += colData[colDataOffset]; - } - } - } - } - } - } - } -}; - /* * \brief Converts the image data of four dimensions(NCHW) into * a sequence data of three dimensions(NST) in the forward calculation, From 07cde439aae38137c42f662382e36d08c03d37fd Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 21 Jun 2017 11:18:58 +0800 Subject: [PATCH 12/20] Reconstruction of GemmConv Based on new im2col. --- paddle/function/GemmConvOp.cpp | 185 +++++++++------------------------ 1 file changed, 48 insertions(+), 137 deletions(-) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index a40e5d9d2e766..3f10bb9c83754 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -12,101 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "GemmConvOp.h" +#include "ConvOp.h" #include "GemmFunctor.h" +#include "Im2Col.h" #include "paddle/math/MemoryHandle.h" namespace paddle { -/* - * imData = [input_channels, input_height, input_width] - * colData = [input_channels, filter_height, filter_width, - * output_height, output_width] - */ -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData) { - int channelsCol = inputChannels * filterHeight * filterWidth; - - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterWidth; - int hOffset = (c / filterWidth) % filterHeight; - int c_im = c / filterWidth / filterHeight; - for (int h = 0; h < outputHeight; ++h) { - for (int w = 0; w < outputWidth; ++w) { - int imRowIdx = h * strideHeight + hOffset; - int imColIdx = w * strideWidth + wOffset; - if ((imRowIdx - paddingHeight) < 0 || - (imRowIdx - paddingHeight) >= inputHeight || - (imColIdx - paddingWidth) < 0 || - (imColIdx - paddingWidth) >= inputWidth) { - colData[(c * outputHeight + h) * outputWidth + w] = T(0); - } else { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - colData[(c * outputHeight + h) * outputWidth + w] = - imData[imRowIdx * inputWidth + imColIdx]; - } - } - } - } - } -}; - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData) { - int channelsCol = inputChannels * filterHeight * filterWidth; - - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterWidth; - int hOffset = (c / filterWidth) % filterHeight; - int c_im = c / filterWidth / filterHeight; - for (int h = 0; h < outputHeight; ++h) { - for (int w = 0; w < outputWidth; ++w) { - int imRowIdx = h * strideHeight + hOffset; - int imColIdx = w * strideWidth + wOffset; - if ((imRowIdx - paddingHeight) >= 0 && - (imRowIdx - paddingHeight) < inputHeight && - (imColIdx - paddingWidth) >= 0 && - (imColIdx - paddingWidth) < inputWidth) { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - imData[imRowIdx * inputWidth + imColIdx] += - colData[(c * outputHeight + h) * outputWidth + w]; - } - } - } - } - } -}; - /* * \brief Forward calculation of convolution. */ @@ -155,15 +67,20 @@ class GemmConvFunction : public ConvFunctionBase { real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); - - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(colShape.getElements()); real* colData = reinterpret_cast(memory_->getBuf()); - Im2ColFunctor im2col; + Im2ColFunctor im2col; GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; @@ -171,18 +88,13 @@ class GemmConvFunction : public ConvFunctionBase { for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { im2col(inputData + g * inputOffset, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, + imShape, + colData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - colData); + paddingW()); int M = outputChannels / groups_; int N = outputHeight * outputWidth; @@ -249,15 +161,20 @@ class GemmConvGradInputFunction : public ConvFunctionBase { real* outputGrad = inputs[0].data(); real* filterData = inputs[1].data(); real* inputGrad = outputs[0].data(); - - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(colShape.getElements()); real* colData = reinterpret_cast(memory_->getBuf()); - Col2ImFunctor col2im; + Col2ImFunctor col2im; GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; @@ -280,20 +197,14 @@ class GemmConvGradInputFunction : public ConvFunctionBase { 0.0f, colData, N); - - col2im(colData, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, + col2im(inputGrad + g * inputOffset, + imShape, + colData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - inputGrad + g * inputOffset); + paddingW()); } inputGrad += inputChannels * inputHeight * inputWidth; outputGrad += outputChannels * outputHeight * outputWidth; @@ -347,33 +258,33 @@ class GemmConvGradFilterFunction : public ConvFunctionBase { real* outputGrad = inputs[0].data(); real* inputData = inputs[1].data(); real* filterGrad = outputs[0].data(); - - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(colShape.getElements()); real* colData = reinterpret_cast(memory_->getBuf()); - Im2ColFunctor im2col; + Im2ColFunctor im2col; GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { im2col(inputData + g * inputOffset, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, + imShape, + colData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - colData); + paddingW()); int M = outputChannels / groups_; int K = outputHeight * outputWidth; From 9e6ed83cc4295414436ab784db10bf715637cddf Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 21 Jun 2017 11:26:40 +0800 Subject: [PATCH 13/20] Fix ImageExpandFunction. --- paddle/function/ImageExpandOp.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp index 625bf5b6edf44..ca1d117db8845 100644 --- a/paddle/function/ImageExpandOp.cpp +++ b/paddle/function/ImageExpandOp.cpp @@ -45,9 +45,7 @@ class ImageExpandFunction : public FunctionBase { numOutputs_ = 1; } - virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {} - - void check(const TensorShape& image, const TensorShape& sequence) const { + void checkShape(const TensorShape& image, const TensorShape& sequence) const { // image shape should be 4-dimensional. CHECK_EQ(image.ndims(), (size_t)4); // sequence shape should be 3-dimensional. @@ -108,12 +106,18 @@ class ImageExpandForward : public ImageExpandFunction { ImageExpandFunction::init(config); } + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& image = inputs[0].shape(); + const TensorShape& sequence = outputs[0].shape(); + checkShape(image, sequence); + } + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(numInputs_, inputs.size()); CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); const TensorShape& image = inputs[0].shape(); const TensorShape& sequence = outputs[0].shape(); - check(image, sequence); TensorShape imShape = TensorShape({image[1], image[2], image[3]}); TensorShape colShape = getColShape(image, sequence); @@ -149,15 +153,21 @@ class ImageExpandBackward : public ImageExpandFunction { ImageExpandFunction::init(config); } + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& image = outputs[0].shape(); + const TensorShape& sequence = inputs[0].shape(); + checkShape(image, sequence); + } + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(numInputs_, inputs.size()); CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); // Since the implementation of Col2ImFunctor is ADD_TO, // this function only supports ADD_TO mode. CHECK_EQ(outputs[0].getArgType(), ADD_TO); const TensorShape& image = outputs[0].shape(); const TensorShape& sequence = inputs[0].shape(); - check(image, sequence); TensorShape imShape = TensorShape({image[1], image[2], image[3]}); TensorShape colShape = getColShape(image, sequence); From 5bfcb7f853834009facd51ce5e2a989240bc3fcc Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 21 Jun 2017 11:31:08 +0800 Subject: [PATCH 14/20] Remove useless code. --- paddle/function/GemmConvOp.h | 62 ----------- paddle/function/GemmConvOpGpu.cu | 186 ------------------------------- 2 files changed, 248 deletions(-) delete mode 100644 paddle/function/GemmConvOp.h delete mode 100644 paddle/function/GemmConvOpGpu.cu diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h deleted file mode 100644 index 9f11cce597a07..0000000000000 --- a/paddle/function/GemmConvOp.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ConvOp.h" - -namespace paddle { - -/* - * imData = [input_channels, input_height, input_width] - * colData = [input_channels, filter_height, filter_width, - * output_height, output_width] - */ -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData); -}; - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData); -}; - -} // namespace paddle diff --git a/paddle/function/GemmConvOpGpu.cu b/paddle/function/GemmConvOpGpu.cu deleted file mode 100644 index 2a1795ff0fb56..0000000000000 --- a/paddle/function/GemmConvOpGpu.cu +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ConvOp.h" -#include "GemmConvOp.h" - -namespace paddle { - -template -__global__ -void im2col(const T* data_im, int numOuts, int height, int width, - int blockH, int blockW, - int strideH, int strideW, - int paddingH, int paddingW, - int height_col, int width_col, - T* data_col) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < numOuts) { - int w_out = index % width_col; - index /= width_col; - int h_out = index % height_col; - int channel_in = index / height_col; - int channel_out = channel_in * blockH * blockW; - int h_in = h_out * strideH; - int w_in = w_out * strideW; - - data_col += (channel_out * height_col + h_out) * width_col + w_out; - for (int i = 0; i < blockH; ++i) { - for (int j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { - *data_col = 0; - } else { - rIdx = rIdx + channel_in*height - paddingH; - cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; - } - data_col += height_col * width_col; - } - } - } -} - -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData) { - int numKernels = inputChannels * outputHeight * outputWidth; - int blocks = (numKernels + 1024 -1) / 1024; - int blockX = 512; - int blockY = (blocks + 512 - 1) / 512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - im2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth, - strideHeight, strideWidth, paddingHeight, paddingWidth, - outputHeight, outputWidth, colData); - CHECK_SYNC("Im2ColFunctor GPU failed"); - } -}; - -template -__global__ -void col2im(size_t n, const T* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - T* data_im) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - T val = 0; - int w = int(index % width); - int h = int((index / width) % height); - int c = int(index / (width * height)); - if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - paddingH) < (height - 2 * paddingH)) { - // compute the start and end of the output - int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); - int h_col_start = - (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; - int h_col_end = min(int(h / strideH + 1), int(height_col)); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - h -= paddingH; - w -= paddingW; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] += val; - } - } -} - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData) { - size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) - * (inputWidth + 2*paddingWidth); - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - col2im<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, - colData, - inputHeight + 2*paddingHeight, - inputWidth + 2*paddingWidth, - inputChannels, - filterHeight, - filterWidth, - strideHeight, - strideWidth, - paddingHeight, - paddingWidth, - outputHeight, - outputWidth, - imData); - CHECK_SYNC("Col2ImFunctor GPU failed"); - } -}; - -template class Im2ColFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; - -} // namespace paddle From 09d712d6aec0376b5ccea09e0d2c546ea1149aba Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 21 Jun 2017 11:38:25 +0800 Subject: [PATCH 15/20] Remove useless code(Matrix::convExpand and Matrix::convShrink). --- paddle/cuda/include/hl_cnn.h | 67 ---------- paddle/cuda/include/stub/hl_cnn_stub.h | 30 ----- paddle/cuda/src/hl_cuda_cnn.cu | 128 ------------------ paddle/math/Matrix.cpp | 172 ------------------------- paddle/math/Matrix.h | 99 -------------- 5 files changed, 496 deletions(-) diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index f55197c8c9ebb..9f84db72da24b 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -17,73 +17,6 @@ limitations under the License. */ #include "hl_base.h" -/** - * @brief Shrink column to feature. - * - * @param[in] dataCol expand data. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] blockH filter height. - * @param[in] blockW filter width. - * @param[in] strideH stride height. - * @param[in] strideW stride width. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[in] outputH output height. - * @param[in] outputW output width. - * @param[out] dataIm output image data. - * @param[in] alpha - * @param[in] beta - */ -extern void hl_shrink_col2feature(const real* dataCol, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataIm, - real alpha = 1.0f, - real beta = 0.0f); - -/** - * @brief Expand feature to column. - * - * @param[in] dataIm input image data. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] blockH filter height. - * @param[in] blockW filter width. - * @param[in] strideH stride height. - * @param[in] strideW stride width. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[in] outputH output height. - * @param[in] outputW output width. - * @param[out] dataCol expand data. - * - */ -extern void hl_expand_feature2col(const real* dataIm, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataCol); - /** * @brief Maximum pool forward. * diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 039551c6cc695..2bbb9fa8dfd5e 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -17,36 +17,6 @@ limitations under the License. */ #include "hl_cnn.h" -inline void hl_shrink_col2feature(const real* dataCol, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataIm, - real alpha, - real beta) {} - -inline void hl_expand_feature2col(const real* dataIm, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataCol) {} - inline void hl_maxpool_forward(const int frameCnt, const real* inputData, const int channels, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index b94f4d8fe4a25..b6e3e63a4f522 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -18,134 +18,6 @@ limitations under the License. */ #include "hl_cnn.h" #include "hl_device_functions.cuh" -__global__ void KeFeature2col(size_t n, size_t height, const real* data_im, - size_t blockH, size_t blockW, size_t width, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - real* data_col) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - size_t w_out = index % width_col; - index /= width_col; - size_t h_out = index % height_col; - size_t channel_in = index / height_col; - size_t channel_out = channel_in * blockH * blockW; - size_t h_in = h_out * strideH; - size_t w_in = w_out * strideW; - - data_col += (channel_out * height_col + h_out) * width_col + w_out; - for (size_t i = 0; i < blockH; ++i) { - for (size_t j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { - *data_col = 0; - } else { - rIdx = rIdx + channel_in*height - paddingH; - cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; - } - data_col += height_col * width_col; - } - } - } -} - -void hl_expand_feature2col(const real* dataIm, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataCol) { - size_t numKernels = channels * outputH * outputW; - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - KeFeature2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, height, dataIm, blockH, blockW, width, - strideH, strideW, paddingH, paddingW, - outputH, outputW, dataCol); - CHECK_SYNC("hl_expand_feature2col failed"); -} - -__global__ void KeCol2Feature(size_t n, const real* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - real* data_im, real alpha, real beta) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - real val = 0; - int w = int(index % width); - int h = int((index / width) % height); - int c = int(index / (width * height)); - if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - paddingH) < (height - 2 * paddingH)) { - // compute the start and end of the output - int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); - int h_col_start = - (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; - int h_col_end = min(int(h / strideH + 1), int(height_col)); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - h -= paddingH; - w -= paddingW; - real tD = data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w]; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] = alpha * val + beta*tD; - } - } -} - -void hl_shrink_col2feature(const real * dataCol, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataIm, real alpha, real beta) { - size_t numKernels = channels * (height + 2*paddingH) * (width + 2*paddingW); - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - KeCol2Feature<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, dataCol, height + 2*paddingH, width + 2*paddingW, - channels, blockH, blockW, strideH, strideW, paddingH, paddingW, - outputH, outputW, dataIm, alpha, beta); - CHECK_SYNC("hl_shrink_col2feature failed"); -} - __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, const int channels, const int height, const int width, diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index c910146164ebf..a3ad9d46e4988 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1016,81 +1016,6 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { LOG(INFO) << "the diffCnt is " << diffCnt; } -void GpuMatrix::convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - CHECK(feature.useGpu_ == true) << "Matrix type are not equal"; - - CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), - feature.getHeight() * feature.getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - - hl_expand_feature2col(feature.getData(), - channels, - feaImgHeight, - feaImgWidth, - blockH, - blockW, - strideH, - strideW, - paddingH, - paddingW, - outputH, - outputW, - getData()); -} - -void GpuMatrix::convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha, - real beta) { - CHECK(expandFeat.useGpu_ == true) << "Matrix type are not equal"; - CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), - getHeight() * getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockW * blockH * channels; - CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) - << "Matrix dimensions are not equal"; - hl_shrink_col2feature(expandFeat.getData(), - channels, - thisImgHeight, - thisImgWidth, - blockH, - blockW, - strideH, - strideW, - paddingH, - paddingW, - outputH, - outputW, - getData(), - alpha, - beta); -} - void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, @@ -1775,103 +1700,6 @@ void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { CHECK_EQ(info, 0); } -void CpuMatrix::convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - CHECK(feature.useGpu_ == false) << "Matrix type are not equal"; - - CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), - feature.getHeight() * feature.getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - - int channelsCol = channels * blockH * blockW; - real* srcData = feature.getData(); - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % blockW; - int hOffset = (c / blockW) % blockH; - int c_im = c / blockH / blockW; - for (int h = 0; h < outputH; ++h) { - for (int w = 0; w < outputW; ++w) { - // no c_im*height to Exclude the channel number - int imgRowIdx = h * strideH + hOffset; - int imgColIdx = w * strideW + wOffset; - if ((imgRowIdx - paddingH) < 0 || - (imgRowIdx - paddingH) >= feaImgHeight || - (imgColIdx - paddingW) < 0 || - (imgColIdx - paddingW) >= feaImgWidth) { - data_[(c * outputH + h) * outputW + w] = 0; - } else { - imgRowIdx += c_im * feaImgHeight - paddingH; - imgColIdx -= paddingW; - data_[(c * outputH + h) * outputW + w] = - srcData[imgRowIdx * feaImgWidth + imgColIdx]; - } - } - } - } -} - -void CpuMatrix::convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha, - real beta) { - CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal"; - CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), - getHeight() * getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - - CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) - << "Matrix dimensions are not equal"; - - real* expandData = expandFeat.getData(); - int channelsCol = channels * blockH * blockW; - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % blockW; - int hOffset = (c / blockW) % blockH; - int c_im = c / blockW / blockH; - for (int h = 0; h < outputH; ++h) { - for (int w = 0; w < outputW; ++w) { - int imRowIdx = h * strideH + hOffset; - int imColIdx = w * strideW + wOffset; - if ((imRowIdx - paddingH) >= 0 && - (imRowIdx - paddingH) < thisImgHeight && - (imColIdx - paddingW) >= 0 && - (imColIdx - paddingW) < thisImgWidth) { - imRowIdx += c_im * thisImgHeight - paddingH; - imColIdx -= paddingW; - data_[imRowIdx * thisImgWidth + imColIdx] = - alpha * expandData[(c * outputH + h) * outputW + w] + - beta * data_[imRowIdx * thisImgWidth + imColIdx]; - } - } - } - } -} - void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 748be850b4c90..bbf98a609c63e 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -858,49 +858,6 @@ class Matrix : public BaseMatrix { LOG(FATAL) << "Not implemented"; } - /** - * This function is used to calculate the convolution: - * - * It will expand a feature matrix according to the - * convolution filters - */ - virtual void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * This function is the reverse implementation of convExpand: - * - * Its function is to restore a expanded-matrix into a feature matrix - */ - virtual void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f) { - LOG(FATAL) << "Not implemeted"; - } - /** * Pooling forward operation, pick out the largest element * in the sizeX of value @@ -1334,34 +1291,6 @@ class GpuMatrix : public Matrix { void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); - void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blochW, - int strideH, - int strideW, - int paddingH, - int paddingWreal, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, @@ -1521,34 +1450,6 @@ class CpuMatrix : public Matrix { MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blcokH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, From 86a679b0c485cac9df354e2c37abaacc8ea9771d Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 21 Jun 2017 17:07:55 +0800 Subject: [PATCH 16/20] Add unit test of ImageExpandOp. --- paddle/function/CMakeLists.txt | 1 + paddle/function/ImageExpandOp.cpp | 1 + paddle/function/ImageExpandOpTest.cpp | 107 ++++++++++++++++++++++++++ 3 files changed, 109 insertions(+) create mode 100644 paddle/function/ImageExpandOpTest.cpp diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 5e170714cf5b1..19f64eefd184e 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -29,6 +29,7 @@ if(WITH_GPU) add_simple_unittest(MulOpTest) add_simple_unittest(CosSimOpTest) add_simple_unittest(RowConvOpTest) + add_simple_unittest(ImageExpandOpTest) endif() add_simple_unittest(ConvOpTest) diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp index ca1d117db8845..00a2571936b39 100644 --- a/paddle/function/ImageExpandOp.cpp +++ b/paddle/function/ImageExpandOp.cpp @@ -116,6 +116,7 @@ class ImageExpandForward : public ImageExpandFunction { CHECK_EQ(numInputs_, inputs.size()); CHECK_EQ(numOutputs_, outputs.size()); check(inputs, outputs); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); const TensorShape& image = inputs[0].shape(); const TensorShape& sequence = outputs[0].shape(); diff --git a/paddle/function/ImageExpandOpTest.cpp b/paddle/function/ImageExpandOpTest.cpp new file mode 100644 index 0000000000000..fb312549dc758 --- /dev/null +++ b/paddle/function/ImageExpandOpTest.cpp @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "FunctionTest.h" + +namespace paddle { + +TEST(ImageExpandForward, real) { + for (size_t batchSize : {5, 32}) { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t block : {1, 3, 5}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + // init Test object + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector blocks = {block, block}; + CpuGpuFuncCompare test("ImageExpand", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + + size_t outputHeight = + 1 + + (inputHeight + 2 * padding - block + stride - 1) / stride; + size_t outputWidth = + 1 + + (inputWidth + 2 * padding - block + stride - 1) / stride; + TensorShape inputShape = + TensorShape({batchSize, channels, inputHeight, inputWidth}); + TensorShape outputShape = + TensorShape({batchSize, + outputHeight * outputWidth, + channels * block * block}); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, inputShape)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outputShape)); + // run Function + test.run(); + } + } + } + } + } + } + } +} + +TEST(ImageExpandBackward, real) { + for (size_t batchSize : {5, 32}) { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t block : {1, 3, 5}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + // init Test object + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector blocks = {block, block}; + CpuGpuFuncCompare test("ImageExpandGrad", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + + size_t outputHeight = + 1 + + (inputHeight + 2 * padding - block + stride - 1) / stride; + size_t outputWidth = + 1 + + (inputWidth + 2 * padding - block + stride - 1) / stride; + TensorShape inputShape = + TensorShape({batchSize, channels, inputHeight, inputWidth}); + TensorShape outputShape = + TensorShape({batchSize, + outputHeight * outputWidth, + channels * block * block}); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, outputShape)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inputShape), + ADD_TO); + // run Function + test.run(); + } + } + } + } + } + } + } +} + +} // namespace paddle From c7610106032f63a0dea4d87bca88a61fc21fe8e3 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 27 Jun 2017 13:32:06 +0800 Subject: [PATCH 17/20] Add unit test for im2col. --- paddle/function/CMakeLists.txt | 1 + paddle/function/Im2ColTest.cpp | 110 +++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 paddle/function/Im2ColTest.cpp diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 19f64eefd184e..178d1153f43ad 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -33,6 +33,7 @@ if(WITH_GPU) endif() add_simple_unittest(ConvOpTest) +add_simple_unittest(Im2ColTest) endif() add_style_check_target(paddle_function ${h_files}) diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp new file mode 100644 index 0000000000000..d7dbf087c5f9c --- /dev/null +++ b/paddle/function/Im2ColTest.cpp @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Im2Col.h" +#include +#include "Function.h" +#include "paddle/math/Matrix.h" +#include "paddle/math/tests/TensorCheck.h" + +namespace paddle { + +TEST(Im2ColFunctor, real) { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t filterHeight : {1, 5}) { + for (size_t filterWidth : {3, 7}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + if (inputHeight <= filterHeight || inputWidth <= filterWidth) + break; + if (padding >= filterHeight || padding >= filterWidth) break; + size_t outputHeight = + (inputHeight - filterHeight + 2 * padding + stride) / + stride; + size_t outputWidth = + (inputWidth - filterWidth + 2 * padding + stride) / stride; + + TensorShape imShape = + TensorShape({channels, inputHeight, inputWidth}); + TensorShape colShape1 = TensorShape({channels, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + TensorShape colShape2 = TensorShape({outputHeight, + outputWidth, + channels, + filterHeight, + filterWidth}); + + VectorPtr input = Vector::create(imShape.getElements(), false); + size_t height = channels * filterHeight * filterWidth; + size_t width = outputHeight * outputWidth; + MatrixPtr output1 = Matrix::create(height, width, false, false); + MatrixPtr output2 = Matrix::create(width, height, false, false); + Im2ColFunctor im2col1; + Im2ColFunctor im2col2; + + input->uniform(0.001, 1); + im2col1(input->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding); + im2col2(input->getData(), + imShape, + output2->getData(), + colShape2, + stride, + stride, + padding, + padding); + + MatrixPtr test; + output2->transpose(test, true); + autotest::TensorCheckErr(*output1, *test); + } + } + } + } + } + } + } +} + +#if 0 +TEST(Col2ImFunctor, real) { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t filterHeight : {1, 5}) { + for (size_t filterWidth : {3, 7}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + } + } + } + } + } + } + } +} +#endif + +} // namespace paddle From a83d52151cbe6ed82b0b35eb21219442a8ac926a Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 27 Jun 2017 17:34:24 +0800 Subject: [PATCH 18/20] Add unit test for Col2ImFunctor. --- paddle/function/Im2ColTest.cpp | 63 +++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp index d7dbf087c5f9c..acc88a553abe7 100644 --- a/paddle/function/Im2ColTest.cpp +++ b/paddle/function/Im2ColTest.cpp @@ -20,7 +20,8 @@ limitations under the License. */ namespace paddle { -TEST(Im2ColFunctor, real) { +template +void TestIm2ColFunctor() { for (size_t channels : {1, 5, 32}) { for (size_t inputHeight : {5, 33, 100}) { for (size_t inputWidth : {5, 32, 96}) { @@ -50,16 +51,18 @@ TEST(Im2ColFunctor, real) { filterHeight, filterWidth}); - VectorPtr input = Vector::create(imShape.getElements(), false); size_t height = channels * filterHeight * filterWidth; size_t width = outputHeight * outputWidth; + VectorPtr input1 = Vector::create(imShape.getElements(), false); + VectorPtr input2 = Vector::create(imShape.getElements(), false); MatrixPtr output1 = Matrix::create(height, width, false, false); MatrixPtr output2 = Matrix::create(width, height, false, false); - Im2ColFunctor im2col1; - Im2ColFunctor im2col2; + input1->uniform(0.001, 1); + input2->copyFrom(*input1); - input->uniform(0.001, 1); - im2col1(input->getData(), + Im2ColFunctor im2Col1; + Im2ColFunctor im2Col2; + im2Col1(input1->getData(), imShape, output1->getData(), colShape1, @@ -67,7 +70,7 @@ TEST(Im2ColFunctor, real) { stride, padding, padding); - im2col2(input->getData(), + im2Col2(input2->getData(), imShape, output2->getData(), colShape2, @@ -76,27 +79,32 @@ TEST(Im2ColFunctor, real) { padding, padding); + // The transposition of the result of ColFormat == kCFO + // is equal to the result of ColFormat == kOCF. MatrixPtr test; output2->transpose(test, true); autotest::TensorCheckErr(*output1, *test); - } - } - } - } - } - } - } -} -#if 0 -TEST(Col2ImFunctor, real) { - for (size_t channels : {1, 5, 32}) { - for (size_t inputHeight : {5, 33, 100}) { - for (size_t inputWidth : {5, 32, 96}) { - for (size_t filterHeight : {1, 5}) { - for (size_t filterWidth : {3, 7}) { - for (size_t stride : {1, 2}) { - for (size_t padding : {0, 1}) { + Col2ImFunctor col2Im1; + Col2ImFunctor col2Im2; + col2Im1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding); + col2Im2(input2->getData(), + imShape, + output2->getData(), + colShape2, + stride, + stride, + padding, + padding); + + autotest::TensorCheckErr(*input1, *input2); } } } @@ -105,6 +113,13 @@ TEST(Col2ImFunctor, real) { } } } + +TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor(); } + +#ifndef PADDLE_ONLY_CPU + +TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor(); } + #endif } // namespace paddle From a7ff11404d097f759aaa2142458750631a9b7641 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 27 Jun 2017 17:53:31 +0800 Subject: [PATCH 19/20] Change the ImageFunction name to BlockFunction(Consistent with the name of Layer). --- paddle/function/ImageExpandOp.cpp | 18 +++++++++--------- paddle/function/ImageExpandOpTest.cpp | 8 ++++---- paddle/gserver/layers/BlockExpandLayer.cpp | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/ImageExpandOp.cpp index 00a2571936b39..a89b6bba45843 100644 --- a/paddle/function/ImageExpandOp.cpp +++ b/paddle/function/ImageExpandOp.cpp @@ -32,7 +32,7 @@ namespace paddle { * \param inputs[0] Sequence data of NST format. * \param outputs[0] Image data of NCHW format. */ -class ImageExpandFunction : public FunctionBase { +class BlockExpandFunction : public FunctionBase { public: void init(const FuncConfig& config) override { // function arguments @@ -100,10 +100,10 @@ class ImageExpandFunction : public FunctionBase { }; template -class ImageExpandForward : public ImageExpandFunction { +class BlockExpandForward : public BlockExpandFunction { public: void init(const FuncConfig& config) override { - ImageExpandFunction::init(config); + BlockExpandFunction::init(config); } void check(const BufferArgs& inputs, const BufferArgs& outputs) override { @@ -148,10 +148,10 @@ class ImageExpandForward : public ImageExpandFunction { }; template -class ImageExpandBackward : public ImageExpandFunction { +class BlockExpandBackward : public BlockExpandFunction { public: void init(const FuncConfig& config) override { - ImageExpandFunction::init(config); + BlockExpandFunction::init(config); } void check(const BufferArgs& inputs, const BufferArgs& outputs) override { @@ -192,11 +192,11 @@ class ImageExpandBackward : public ImageExpandFunction { } }; -REGISTER_TYPED_FUNC(ImageExpand, CPU, ImageExpandForward); -REGISTER_TYPED_FUNC(ImageExpandGrad, CPU, ImageExpandBackward); +REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward); +REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward); #ifndef PADDLE_ONLY_CPU -REGISTER_TYPED_FUNC(ImageExpand, GPU, ImageExpandForward); -REGISTER_TYPED_FUNC(ImageExpandGrad, GPU, ImageExpandBackward); +REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward); +REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward); #endif } // namespace paddle diff --git a/paddle/function/ImageExpandOpTest.cpp b/paddle/function/ImageExpandOpTest.cpp index fb312549dc758..5e4897e72ba9f 100644 --- a/paddle/function/ImageExpandOpTest.cpp +++ b/paddle/function/ImageExpandOpTest.cpp @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { -TEST(ImageExpandForward, real) { +TEST(BlockExpandForward, real) { for (size_t batchSize : {5, 32}) { for (size_t channels : {1, 5, 32}) { for (size_t inputHeight : {5, 33, 100}) { @@ -29,7 +29,7 @@ TEST(ImageExpandForward, real) { std::vector strides = {stride, stride}; std::vector paddings = {padding, padding}; std::vector blocks = {block, block}; - CpuGpuFuncCompare test("ImageExpand", + CpuGpuFuncCompare test("BlockExpand", FuncConfig() .set("strides", strides) .set("paddings", paddings) @@ -60,7 +60,7 @@ TEST(ImageExpandForward, real) { } } -TEST(ImageExpandBackward, real) { +TEST(BlockExpandBackward, real) { for (size_t batchSize : {5, 32}) { for (size_t channels : {1, 5, 32}) { for (size_t inputHeight : {5, 33, 100}) { @@ -72,7 +72,7 @@ TEST(ImageExpandBackward, real) { std::vector strides = {stride, stride}; std::vector paddings = {padding, padding}; std::vector blocks = {block, block}; - CpuGpuFuncCompare test("ImageExpandGrad", + CpuGpuFuncCompare test("BlockExpandGrad", FuncConfig() .set("strides", strides) .set("paddings", paddings) diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp index adc9a814ffbb1..3b1f346359172 100644 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ b/paddle/gserver/layers/BlockExpandLayer.cpp @@ -41,13 +41,13 @@ bool BlockExpandLayer::init(const LayerMap& layerMap, std::vector paddings = {(size_t)paddingH_, (size_t)paddingW_}; std::vector blocks = {(size_t)blockH_, (size_t)blockW_}; createFunction(forward_, - "ImageExpand", + "BlockExpand", FuncConfig() .set("strides", strides) .set("paddings", paddings) .set("blocks", blocks)); createFunction(backward_, - "ImageExpandGrad", + "BlockExpandGrad", FuncConfig() .set("strides", strides) .set("paddings", paddings) From 7a550f90d8a7a1aea81f300d127f3aef975f8693 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 27 Jun 2017 18:05:14 +0800 Subject: [PATCH 20/20] Fix the function file name. --- paddle/function/{ImageExpandOp.cpp => BlockExpandOp.cpp} | 0 .../function/{ImageExpandOpTest.cpp => BlockExpandOpTest.cpp} | 0 paddle/function/CMakeLists.txt | 2 +- 3 files changed, 1 insertion(+), 1 deletion(-) rename paddle/function/{ImageExpandOp.cpp => BlockExpandOp.cpp} (100%) rename paddle/function/{ImageExpandOpTest.cpp => BlockExpandOpTest.cpp} (100%) diff --git a/paddle/function/ImageExpandOp.cpp b/paddle/function/BlockExpandOp.cpp similarity index 100% rename from paddle/function/ImageExpandOp.cpp rename to paddle/function/BlockExpandOp.cpp diff --git a/paddle/function/ImageExpandOpTest.cpp b/paddle/function/BlockExpandOpTest.cpp similarity index 100% rename from paddle/function/ImageExpandOpTest.cpp rename to paddle/function/BlockExpandOpTest.cpp diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 178d1153f43ad..bef4d2955b3c9 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -29,7 +29,7 @@ if(WITH_GPU) add_simple_unittest(MulOpTest) add_simple_unittest(CosSimOpTest) add_simple_unittest(RowConvOpTest) - add_simple_unittest(ImageExpandOpTest) + add_simple_unittest(BlockExpandOpTest) endif() add_simple_unittest(ConvOpTest)