From 3b65bc7a2631fe2326149cd228d2ecde3a86081b Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 26 May 2017 14:17:25 +0800 Subject: [PATCH 01/24] Add a naive convolution implement --- paddle/function/ConvOp.cpp | 128 +++++++++++++++++++++++++++++++++++++ paddle/function/ConvOp.h | 67 +++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 paddle/function/ConvOp.cpp create mode 100644 paddle/function/ConvOp.h diff --git a/paddle/function/ConvOp.cpp b/paddle/function/ConvOp.cpp new file mode 100644 index 0000000000000..50f030585a0b5 --- /dev/null +++ b/paddle/function/ConvOp.cpp @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvFunc.h" + +namespace paddle { + +/* + * The three arguments are stored in memory in row major order. + * inputData = [batchSize, inputChannels, inputHeight, inputWidth] + * filterData = [outputChannels, inputChannels, filterHeight, filterWidth] + * outputData = [batchSize, outputChannels, outputHeight, outputWidth] + */ +template +class NaiveConvFunctor { +public: + void operator()(const T* inputData, + size_t batchSize, + size_t inputChannels, + size_t inputHeight, + size_t inputWidth, + const T* filterData, + size_t filterHeight, + size_t filterWidth, + T* outputData, + size_t outputChannels, + size_t outputHeight, + size_t outputWidth, + size_t padding, + size_t stride) { + for (size_t batch = 0; batch < batchSize; batch++) { + for (size_t outC = 0; outC < outputChannels; outC++) { + for (size_t outH = 0; outH < outputHeight; outH++) { + for (size_t outW = 0; outW < outputWidth; outW++) { + const int inStartH = (outH * stride) - padding; + const int inStartW = (outW * stride) - padding; + T outValue = (T)0; + for (size_t inC = 0; inC < inputChannels; inC++) { + for (size_t fH = 0; fH < filterHeight; fH++) { + for (size_t fW = 0; fW < filterWidth; fW++) { + T inValue; + const int inH = inStartH + fH; + const int inW = inStartW + fW; + if ((inH >= 0 && inH < inputHeight) && + (inW >= 0 && inW < inputWidth)) { + size_t offsetInput = + batch * inputChannels * inputHeight * inputWidth + + inC * inputHeight * inputWidth + inH * inputWidth + inW; + inValue = inputData[offsetInput]; + } else { + inValue = (T)0; + } + size_t offsetFilter = + outC * inputChannels * filterHeight * filterWidth + + inC * filterHeight * filterWidth + fH * filterWidth + fW; + T filterValue = filterData[offsetFilter]; + outValue += (inValue * filterValue); + } + } + } + + size_t offset = + batch * outputChannels * outputHeight * outputWidth + + outC * outputHeight * outputWidth + outH * outputWidth + outW; + outputData[offset] = outValue; + } + } + } + } + } +}; + +template +class NaiveConvFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + check(inputs, outputs); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + + size_t batchSize = 
inputs[0].shape()[0]; + size_t inputChannels = inputs[0].shape()[1]; + size_t inputHeight = inputs[0].shape()[2]; + size_t inputWidth = inputs[0].shape()[3]; + size_t filterHeight = inputs[1].shape()[2]; + size_t filterWidth = inputs[1].shape()[2]; + size_t outputChannels = outputs[0].shape()[1]; + size_t outputHeight = outputs[0].shape()[2]; + size_t outputWidth = outputs[0].shape()[3]; + + float* inputData = inputs[0].data(); + float* filterData = inputs[1].data(); + float* outputData = outputs[0].data(); + NaiveConvFunctor conv; + conv(inputData, + batchSize, + inputChannels, + inputHeight, + inputWidth, + filterData, + filterHeight, + filterWidth, + outputData, + outputChannels, + outputHeight, + outputWidth, + padding_, + stride_); + } +}; + +REGISTER_TYPED_FUNC(NaiveConv, CPU, NaiveConvFunction); + +} // namespace paddle diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h new file mode 100644 index 0000000000000..4d678cfe2735c --- /dev/null +++ b/paddle/function/ConvOp.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" + +namespace paddle { + +/* + * Function Arguments: + * + * \param inputs[0] Input image data, is NCHW format, where N is batch size, + * C is the number of channels, H and W is the height and + * width of input image. + * \param inputs[1] Filter data, is MCHW, where M is the number of output + * channels, C is the number of input channels, H and W + * is height and width of filter. + * \param outputs[0] Output image data, is NCHW format, where N is batch size, + * C is the number of channels, H and W is the height and + * width of output image. + * + * \note Implemented based on the ConvFunctionBase class only supports + * input data in the NCHW format. 
+ */ +class ConvFunctionBase : public FunctionBase { +public: + void init(const FuncConfig& config) override { + // function arguments + stride_ = config.get("stride"); + padding_ = config.get("padding"); + + // number of inputs and outputs + numInputs_ = 2; + numOutputs_ = 1; + } + + virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {} + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + + CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); + CHECK_EQ(inputs[1].shape().ndims(), (size_t)4); + CHECK_EQ(outputs[0].shape().ndims(), (size_t)4); + + CHECK(inputs[0].shape()[0] == outputs[0].shape()[0]); + CHECK(inputs[0].shape()[1] == inputs[1].shape()[1]); + CHECK(outputs[0].shape()[1] == inputs[1].shape()[0]); + } + +protected: + size_t padding_; + size_t stride_; +}; + +} // namespace paddle From b6de52c47bbd0a5f5d943364a8ecfcec3df7da6a Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 26 May 2017 14:50:01 +0800 Subject: [PATCH 02/24] Bug fix --- paddle/function/{ConvOp.cpp => NaiveConvOp.cpp} | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) rename paddle/function/{ConvOp.cpp => NaiveConvOp.cpp} (95%) diff --git a/paddle/function/ConvOp.cpp b/paddle/function/NaiveConvOp.cpp similarity index 95% rename from paddle/function/ConvOp.cpp rename to paddle/function/NaiveConvOp.cpp index 50f030585a0b5..f13aa880a1e88 100644 --- a/paddle/function/ConvOp.cpp +++ b/paddle/function/NaiveConvOp.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "ConvFunc.h" +#include "ConvOp.h" namespace paddle { @@ -102,10 +102,10 @@ class NaiveConvFunction : public ConvFunctionBase { size_t outputHeight = outputs[0].shape()[2]; size_t outputWidth = outputs[0].shape()[3]; - float* inputData = inputs[0].data(); - float* filterData = inputs[1].data(); - float* outputData = outputs[0].data(); - NaiveConvFunctor conv; + real* inputData = inputs[0].data(); + real* filterData = inputs[1].data(); + real* outputData = outputs[0].data(); + NaiveConvFunctor conv; conv(inputData, batchSize, inputChannels, From 1846d9e1725465fcde1bc4e54384036edd483d80 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 26 May 2017 15:12:54 +0800 Subject: [PATCH 03/24] Add a convolution Function based on gemm. --- paddle/function/GemmConvOp.cpp | 162 +++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 paddle/function/GemmConvOp.cpp diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp new file mode 100644 index 0000000000000..5e6ee24486213 --- /dev/null +++ b/paddle/function/GemmConvOp.cpp @@ -0,0 +1,162 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ConvOp.h" +#include "paddle/math/MathFunctions.h" +#include "paddle/math/MemoryHandle.h" + +namespace paddle { + +/* + * imData = [input_channels, input_height, input_width] + * colData = [input_channels, filter_height, filter_width, + * output_height, output_width] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth, + T* colData) { + int channelsCol = inputChannels * filterHeight * filterWidth; + + for (int c = 0; c < channelsCol; ++c) { + int wOffset = c % filterWidth; + int hOffset = (c / filterWidth) % filterHeight; + int c_im = c / filterHeight / filterWidth; + for (int h = 0; h < outputHeight; ++h) { + for (int w = 0; w < outputWidth; ++w) { + // no c_im*height to Exclude the channel number + int imgRowIdx = h * strideHeight + hOffset; + int imgColIdx = w * strideWidth + wOffset; + if ((imgRowIdx - paddingHeight) < 0 || + (imgRowIdx - paddingHeight) >= inputHeight || + (imgColIdx - paddingWidth) < 0 || + (imgColIdx - paddingWidth) >= inputWidth) { + colData[(c * outputHeight + h) * outputWidth + w] = T(0); + } else { + imgRowIdx += c_im * inputHeight - paddingHeight; + imgColIdx -= paddingWidth; + colData[(c * outputHeight + h) * outputWidth + w] = + imData[imgRowIdx * inputWidth + imgColIdx]; + } + } + } + } + } +}; + +/* + * Function Arguments: + * + * \param inputs[0] Input image data, is NCHW format, where N is batch size, + * C is the number of channels, H and W is the height and + * width of input image. + * \param inputs[1] Filter data, is MCHW, where M is the number of output + * channels, C is the number of input channels, H and W + * is height and width of filter. + * \param outputs[0] Output image data, is NCHW format, where N is batch size, + * C is the number of channels, H and W is the height and + * width of output image. 
+ */ +template +class GemmConvFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + check(inputs, outputs); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + + size_t batchSize = inputs[0].shape()[0]; + size_t inputChannels = inputs[0].shape()[1]; + size_t inputHeight = inputs[0].shape()[2]; + size_t inputWidth = inputs[0].shape()[3]; + size_t filterHeight = inputs[1].shape()[2]; + size_t filterWidth = inputs[1].shape()[2]; + size_t outputChannels = outputs[0].shape()[1]; + size_t outputHeight = outputs[0].shape()[2]; + size_t outputWidth = outputs[0].shape()[3]; + + real* inputData = inputs[0].data(); + real* filterData = inputs[1].data(); + real* outputData = outputs[0].data(); + + size_t size = + inputChannels * filterHeight * filterWidth * outputHeight * outputWidth; + resizeBuffer(size); + real* colData = reinterpret_cast(memory_->getBuf()); + + Im2ColFunctor im2col; + for (size_t i = 0; i < batchSize; i++) { + im2col(inputData, + inputChannels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + stride_, + stride_, + padding_, + padding_, + outputHeight, + outputWidth, + colData); + + int M = outputChannels; + int N = outputHeight * outputWidth; + int K = inputChannels * filterHeight * filterWidth; + gemm(CblasNoTrans, + CblasNoTrans, + M, + N, + K, + 1.0f, + filterData, + K, + colData, + N, + 0.0f, + outputData, + N); + inputData += inputChannels * inputHeight * inputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } + + void resizeBuffer(size_t newSize) { + if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) { + memory_ = std::make_shared(newSize * sizeof(real)); + } + } + +private: + CpuMemHandlePtr memory_; +}; + +REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); + +} // namespace paddle From 1879332a30e0d403a0a36c0f0b834c0b8070cef4 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 26 May 2017 20:01:04 +0800 Subject: [PATCH 04/24] Modify FunctionCompare to Compare2Function to support comparison of two CPU functions. 
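
The old CPU-vs-GPU behaviour is kept through a CpuGpuFuncCompare alias,
while the generic Compare2Function template can now also pair two CPU
kernels. A minimal usage sketch (the function names and shapes below are
placeholders, not functions added by this patch):

    // Compare two registered CPU implementations of the same operation.
    // "FuncA-CPU" / "FuncB-CPU" are illustrative names only.
    Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
        "FuncA-CPU", "FuncB-CPU", FuncConfig());
    // Arguments carry only value type and shape; run() fills the first
    // function's buffers with uniform random data, copies them to the
    // second function, calls both, and compares the outputs.
    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{16, 32}));
    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{16, 32}));
    test.run();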
--- paddle/function/ContextProjectionOpTest.cpp | 4 +- paddle/function/CosSimOpTest.cpp | 4 +- paddle/function/CrossMapNormalOpTest.cpp | 20 +- paddle/function/FunctionTest.h | 278 ++++++++++++-------- paddle/function/MulOpTest.cpp | 14 +- paddle/function/PadOpTest.cpp | 2 +- 6 files changed, 187 insertions(+), 135 deletions(-) diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp index 1b25172ca5c0c..9e9dd20e6f3ab 100644 --- a/paddle/function/ContextProjectionOpTest.cpp +++ b/paddle/function/ContextProjectionOpTest.cpp @@ -28,7 +28,7 @@ void testMatrixProjectionForward(int context_start, std::max(0, (int)(context_start + context_length - 1)); if (pad == 0) is_padding = false; - FunctionCompare test( + CpuGpuFuncCompare test( "ContextProjectionForward", FuncConfig() .set("context_length", context_length) @@ -60,7 +60,7 @@ void testMatrixProjectionBackward(int context_start, std::max(0, (int)(context_start + context_length - 1)); if (pad == 0) is_padding = false; - FunctionCompare test( + CpuGpuFuncCompare test( "ContextProjectionBackward", FuncConfig() .set("context_length", context_length) diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp index 48c815f027161..f6c0041101f50 100644 --- a/paddle/function/CosSimOpTest.cpp +++ b/paddle/function/CosSimOpTest.cpp @@ -22,7 +22,7 @@ void testCosSimForward(size_t height_x, size_t height_y, size_t width, real scale) { - FunctionCompare test("CosSimForward", FuncConfig().set("scale", scale)); + CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale)); // prepare input arguments test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width})); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width})); @@ -36,7 +36,7 @@ void testCosSimBackward(size_t height_x, size_t height_y, size_t width, real scale) { - FunctionCompare test("CosSimBackward", FuncConfig().set("scale", scale)); + CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale)); // prepare input arguments test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp index 51f5da81bfc9a..ed17b17da616d 100644 --- a/paddle/function/CrossMapNormalOpTest.cpp +++ b/paddle/function/CrossMapNormalOpTest.cpp @@ -28,11 +28,11 @@ TEST(CrossMapNormal, real) { << " size=" << size; // init Test object - FunctionCompare test("CrossMapNormal", - FuncConfig() - .set("size", size) - .set("scale", (real)1.5) - .set("pow", (real)0.5)); + CpuGpuFuncCompare test("CrossMapNormal", + FuncConfig() + .set("size", size) + .set("scale", (real)1.5) + .set("pow", (real)0.5)); // prepare input arguments TensorShape shape{numSamples, channels, imgSizeH, imgSizeW}; test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape)); @@ -57,11 +57,11 @@ TEST(CrossMapNormalGrad, real) { << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " size=" << size; - FunctionCompare test("CrossMapNormalGrad", - FuncConfig() - .set("size", size) - .set("scale", (real)1.5) - .set("pow", (real)0.5)); + CpuGpuFuncCompare test("CrossMapNormalGrad", + FuncConfig() + .set("size", size) + .set("scale", (real)1.5) + .set("pow", (real)0.5)); TensorShape shape{numSamples, channels, imgSizeH, imgSizeW}; test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape)); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape)); diff --git a/paddle/function/FunctionTest.h 
b/paddle/function/FunctionTest.h index 0cfafdb27f55a..69ab33052dadc 100644 --- a/paddle/function/FunctionTest.h +++ b/paddle/function/FunctionTest.h @@ -22,14 +22,62 @@ namespace paddle { typedef std::shared_ptr BufferArgPtr; +namespace test { +template +struct Allocator; + +template <> +struct Allocator { + using type = CpuMemoryHandle; +}; + +template <> +struct Allocator { + using type = GpuMemoryHandle; +}; + +// Copy argument1 to argument2 +template +class CopyArgument { +public: + void operator()(const BufferArg& arg1, BufferArg& arg2) { + CHECK_EQ(arg1.valueType(), arg2.valueType()); + CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements()); + + if (arg1.valueType() == VALUE_TYPE_INT32) { + IVectorPtr vector1 = + IVector::create((int*)arg1.data(), + arg1.shape().getElements(), + DType1 == DEVICE_TYPE_CPU ? false : true); + IVectorPtr vector2 = + IVector::create((int*)arg2.data(), + arg2.shape().getElements(), + DType2 == DEVICE_TYPE_CPU ? false : true); + vector2->copyFrom(*vector1); + } else { + VectorPtr vector1 = + Vector::create((real*)arg1.data(), + arg1.shape().getElements(), + DType1 == DEVICE_TYPE_CPU ? false : true); + VectorPtr vector2 = + Vector::create((real*)arg2.data(), + arg2.shape().getElements(), + DType2 == DEVICE_TYPE_CPU ? false : true); + vector2->copyFrom(*vector1); + } + } +}; +} // namespace test + /** - * \brief A class for comparing CPU and GPU implementations of Function. - * + * \brief A class for comparing two Functions of different implementations. + * For example, can be used to compare the CPU and GPU implementation + * of the function is consistent. * * Use case: * // Initializes a test object, the corresponding cpu and gpu Function * // are constructed according to FunctionName and FuncConfig. - * FunctionCompare test(FunctionName, FuncConfig); + * CpuGpuFuncCompare test(FunctionName, FuncConfig); * // Prepare inputs and outputs arguments. * // Here the input and output can not contain real data, * // only contains the argument type and shape. @@ -45,28 +93,38 @@ typedef std::shared_ptr BufferArgPtr; * // Compares CPU and GPU calculation results for consistency. * test.run(); */ -class FunctionCompare { +template +class Compare2Function { public: - FunctionCompare(const std::string& name, const FuncConfig& config) - : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")), - gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) { - cpuFunc_->init(config); - gpuFunc_->init(config); + typedef typename test::Allocator::type Allocator1; + typedef typename test::Allocator::type Allocator2; + typedef typename Tensor::Vector Vector1; + typedef typename Tensor::Vector Vector2; + typedef typename Tensor::SparseMatrix SparseMatrix1; + typedef typename Tensor::SparseMatrix SparseMatrix2; + + Compare2Function(const std::string& name1, + const std::string& name2, + const FuncConfig& config) + : function1_(FunctionBase::funcRegistrar_.createByType(name1)), + function2_(FunctionBase::funcRegistrar_.createByType(name2)) { + function1_->init(config); + function2_->init(config); } - ~FunctionCompare() {} + ~Compare2Function() {} // input need only contains shape, do not contains data. 
void addInputs(const BufferArg& input) { size_t size = input.shape().getElements() * sizeOfValuType(input.valueType()); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); - cpuInputs_.emplace_back(std::make_shared( - cpuMemory_.back()->getBuf(), input.valueType(), input.shape())); - gpuInputs_.emplace_back(std::make_shared( - gpuMemory_.back()->getBuf(), input.valueType(), input.shape())); + func1Inputs_.emplace_back(std::make_shared( + func1Memory_.back()->getBuf(), input.valueType(), input.shape())); + func2Inputs_.emplace_back(std::make_shared( + func2Memory_.back()->getBuf(), input.valueType(), input.shape())); } // assume one copy of sequence is shared by different SequenceArgs @@ -75,62 +133,57 @@ class FunctionCompare { size_t batchSize = input.shape()[0]; size_t numSeqs = batchSize / 10 + 1; size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32); - cpuMemory_.emplace_back(std::make_shared(sizeId)); - gpuMemory_.emplace_back(std::make_shared(sizeId)); - cpuSeq_ = std::make_shared(cpuMemory_.back()->getBuf(), - TensorShape{numSeqs + 1}); - gpuSeq_ = std::make_shared(gpuMemory_.back()->getBuf(), - TensorShape{numSeqs + 1}); + func1Memory_.emplace_back(std::make_shared(sizeId)); + func2Memory_.emplace_back(std::make_shared(sizeId)); + seq1_ = std::make_shared(func1Memory_.back()->getBuf(), + TensorShape{numSeqs + 1}); + seq2_ = std::make_shared(func2Memory_.back()->getBuf(), + TensorShape{numSeqs + 1}); /// init sequence Id - initArg(*cpuSeq_, batchSize); + initArg(*seq1_, batchSize); - // todo(tianbing), delete it - CHECK_EQ(cpuSeq_->shape().getElements(), cpuSeq_->numSeqs() + 1); - - CpuIVector cpuSeq(cpuSeq_->shape().getElements(), (int*)cpuSeq_->data()); - GpuIVector gpuSeq(gpuSeq_->shape().getElements(), (int*)gpuSeq_->data()); - gpuSeq.copyFrom(cpuSeq); + copyArg_(*seq1_, *seq2_); } void addInputs(const SequenceArg& input) { CHECK_EQ(input.shape().ndims(), 2UL); size_t batchSize = input.shape()[0]; - if (!cpuSeq_ || !gpuSeq_) { // sequence not exist + if (!seq1_ || !seq2_) { // sequence not exist addSequence(SequenceIdArg(TensorShape{batchSize})); } size_t size = input.shape().getElements() * sizeOfValuType(input.valueType()); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); /// SequenceArg - cpuInputs_.emplace_back( - std::make_shared(cpuMemory_.back()->getBuf(), + func1Inputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), input.valueType(), input.shape(), - *cpuSeq_)); - gpuInputs_.emplace_back( - std::make_shared(gpuMemory_.back()->getBuf(), + *seq1_)); + func2Inputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), input.valueType(), input.shape(), - *gpuSeq_)); + *seq2_)); } // output need only contains shape, do not contains data. 
void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) { size_t size = output.shape().getElements() * sizeOfValuType(output.valueType()); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); - cpuOutputs_.emplace_back( - std::make_shared(cpuMemory_.back()->getBuf(), + func1Outputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), output.valueType(), output.shape(), argType)); - gpuOutputs_.emplace_back( - std::make_shared(gpuMemory_.back()->getBuf(), + func2Outputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), output.valueType(), output.shape(), argType)); @@ -138,14 +191,14 @@ class FunctionCompare { /// add and init output sparse matrix void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) { - cpuSparse_ = std::make_shared( + sparse1_ = std::make_shared( output.shape()[0], output.shape()[1], output.nnz(), static_cast(output.dataType()), static_cast(output.dataFormat())); - gpuSparse_ = std::make_shared( + sparse2_ = std::make_shared( output.shape()[0], output.shape()[1], output.nnz(), @@ -154,52 +207,52 @@ class FunctionCompare { /// init sparse matrix hl_stream_t stream(HPPL_STREAM_1); - cpuSparse_->randomizeUniform(); - gpuSparse_->copyFrom(*cpuSparse_, stream); + sparse1_->randomizeUniform(); + sparse2_->copyFrom(*sparse1_, stream); hl_stream_synchronize(stream); - cpuOutputs_.emplace_back( - std::make_shared(*cpuSparse_, argType)); - gpuOutputs_.emplace_back( - std::make_shared(*gpuSparse_, argType)); + func1Outputs_.emplace_back( + std::make_shared(*sparse1_, argType)); + func2Outputs_.emplace_back( + std::make_shared(*sparse2_, argType)); } void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) { CHECK_EQ(output.shape().ndims(), 2UL); size_t batchSize = output.shape()[0]; - if (!cpuSeq_ || !gpuSeq_) { // sequence not exist + if (!seq1_ || !seq2_) { // sequence not exist addSequence(SequenceIdArg(TensorShape{batchSize})); } size_t size = output.shape().getElements() * sizeOfValuType(output.valueType()); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); /// SequenceArg - cpuOutputs_.emplace_back( - std::make_shared(cpuMemory_.back()->getBuf(), + func1Outputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), output.valueType(), output.shape(), - *cpuSeq_, + *seq1_, argType)); - gpuOutputs_.emplace_back( - std::make_shared(gpuMemory_.back()->getBuf(), + func2Outputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), output.valueType(), output.shape(), - *gpuSeq_, + *seq2_, argType)); } void addInputs(const SparseMatrixArg& input) { - cpuSparse_ = std::make_shared( + sparse1_ = std::make_shared( input.shape()[0], input.shape()[1], input.nnz(), static_cast(input.dataType()), static_cast(input.dataFormat())); - gpuSparse_ = std::make_shared( + sparse2_ = std::make_shared( input.shape()[0], input.shape()[1], input.nnz(), @@ -208,12 +261,12 @@ class FunctionCompare { /// init sparse matrix hl_stream_t stream(HPPL_STREAM_1); - cpuSparse_->randomizeUniform(); - gpuSparse_->copyFrom(*cpuSparse_, stream); + sparse1_->randomizeUniform(); + sparse2_->copyFrom(*sparse1_, stream); hl_stream_synchronize(stream); - cpuInputs_.emplace_back(std::make_shared(*cpuSparse_)); - 
gpuInputs_.emplace_back(std::make_shared(*gpuSparse_)); + func1Inputs_.emplace_back(std::make_shared(*sparse1_)); + func2Inputs_.emplace_back(std::make_shared(*sparse2_)); } void run() { @@ -236,27 +289,27 @@ class FunctionCompare { function->calc(inArgs, outArgs); }; - callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_); - callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_); + callFunction(function1_.get(), func1Inputs_, func1Outputs_); + callFunction(function2_.get(), func2Inputs_, func2Outputs_); // check outputs compareOutputs(); } - std::shared_ptr getCpuFunction() const { return cpuFunc_; } + std::shared_ptr getCpuFunction() const { return function1_; } - std::shared_ptr getGpuFunction() const { return gpuFunc_; } + std::shared_ptr getGpuFunction() const { return function2_; } protected: // only init cpu argument, gpu argument copy from cpu argument. void initArg(BufferArg& arg) { - CpuVector vector(arg.shape().getElements(), (real*)arg.data()); + Vector1 vector(arg.shape().getElements(), (real*)arg.data()); vector.uniform(0.001, 1); } void initArg(SequenceArg& arg) { /// init only matrix - CpuVector vector(arg.shape().getElements(), (real*)arg.data()); + Vector1 vector(arg.shape().getElements(), (real*)arg.data()); vector.uniform(0.001, 1); } @@ -276,73 +329,72 @@ class FunctionCompare { } void initInputs() { - for (size_t i = 0; i < cpuInputs_.size(); i++) { - if (cpuInputs_[i]->isSparseArg()) { + for (size_t i = 0; i < func1Inputs_.size(); i++) { + if (func1Inputs_[i]->isSparseArg()) { continue; /// sparse matrix already init } - if (cpuInputs_[i]->isSequenceArg()) { - initArg(dynamic_cast(*cpuInputs_[i])); + if (func1Inputs_[i]->isSequenceArg()) { + initArg(dynamic_cast(*func1Inputs_[i])); } else { - initArg(*cpuInputs_[i]); + initArg(*func1Inputs_[i]); } - // TODO: Need a BufferCopy used to copy from one BufferArg to another. - CpuVector cpuVector(cpuInputs_[i]->shape().getElements(), - (real*)cpuInputs_[i]->data()); - GpuVector gpuVector(gpuInputs_[i]->shape().getElements(), - (real*)gpuInputs_[i]->data()); - gpuVector.copyFrom(cpuVector); + copyArg_(*func1Inputs_[i], *func2Inputs_[i]); } } void initOutputs() { - for (size_t i = 0; i < cpuOutputs_.size(); i++) { - if (cpuOutputs_[i]->isSparseArg()) { + for (size_t i = 0; i < func1Outputs_.size(); i++) { + if (func1Outputs_[i]->isSparseArg()) { continue; /// sparse matrix already init } - if (cpuOutputs_[i]->isSequenceArg()) { - initArg(dynamic_cast(*cpuOutputs_[i])); + if (func1Outputs_[i]->isSequenceArg()) { + initArg(dynamic_cast(*func1Outputs_[i])); } else { - initArg(*cpuOutputs_[i]); + initArg(*func1Outputs_[i]); } - // TODO: Need a BufferCopy used to copy from one BufferArg to another. - CpuVector cpuVector(cpuOutputs_[i]->shape().getElements(), - (real*)cpuOutputs_[i]->data()); - GpuVector gpuVector(gpuOutputs_[i]->shape().getElements(), - (real*)gpuOutputs_[i]->data()); - - gpuVector.copyFrom(cpuVector); + copyArg_(*func1Outputs_[i], *func2Outputs_[i]); } } void compareOutputs() { - for (size_t i = 0; i < cpuOutputs_.size(); i++) { + for (size_t i = 0; i < func1Outputs_.size(); i++) { // TODO, Need a BufferCheck used to compare the two buffers. 
- const auto cpu = cpuOutputs_[i]; - const auto gpu = gpuOutputs_[i]; + const auto cpu = func1Outputs_[i]; + const auto gpu = func2Outputs_[i]; CHECK_EQ(cpu->numElements(), gpu->numElements()); - CpuVector cpuVector(cpu->numElements(), (real*)cpu->data()); - GpuVector gpuVector(gpu->numElements(), (real*)gpu->data()); + Vector1 cpuVector(cpu->numElements(), (real*)cpu->data()); + Vector2 gpuVector(gpu->numElements(), (real*)gpu->data()); autotest::TensorCheckErr(cpuVector, gpuVector); } } protected: - std::shared_ptr cpuFunc_; - std::shared_ptr gpuFunc_; - std::vector cpuMemory_; - std::vector gpuMemory_; - std::vector cpuInputs_; - std::vector cpuOutputs_; - std::vector gpuInputs_; - std::vector gpuOutputs_; - std::shared_ptr cpuSparse_; - std::shared_ptr gpuSparse_; - std::shared_ptr cpuSeq_; - std::shared_ptr gpuSeq_; + std::shared_ptr function1_; + std::shared_ptr function2_; + std::vector> func1Memory_; + std::vector> func2Memory_; + std::vector func1Inputs_; + std::vector func1Outputs_; + std::vector func2Inputs_; + std::vector func2Outputs_; + std::shared_ptr sparse1_; + std::shared_ptr sparse2_; + std::shared_ptr seq1_; + std::shared_ptr seq2_; + test::CopyArgument copyArg_; +}; + +class CpuGpuFuncCompare + : public Compare2Function { +public: + CpuGpuFuncCompare(const std::string& name, const FuncConfig& config) + : Compare2Function(name + "-CPU", name + "-GPU", config) {} + + ~CpuGpuFuncCompare() {} }; } // namespace paddle diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp index 8753057ebf73c..d31eb0c74f25f 100644 --- a/paddle/function/MulOpTest.cpp +++ b/paddle/function/MulOpTest.cpp @@ -35,7 +35,7 @@ void testFuncDDDMatrix( size_t heightC = dimM; size_t widthC = dimN; // init Test object - FunctionCompare test( + CpuGpuFuncCompare test( "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb)); // prepare input arguments /// matrix A : HA * WA @@ -81,8 +81,8 @@ void testFuncDSparseDMatrix( size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real scaleT = 1.0; // init Test object - FunctionCompare test("MulOp", - FuncConfig().set("aTrans", false).set("bTrans", false)); + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); // prepare input arguments /// sparse matrix A : M * K test.addInputs(SparseMatrixArg( @@ -126,8 +126,8 @@ void testFuncDDSparseMatrix( size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real scaleT = 1.0; // init Test object - FunctionCompare test("MulOp", - FuncConfig().set("aTrans", false).set("bTrans", false)); + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); // prepare input arguments /// matrix A : M * K test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); @@ -172,8 +172,8 @@ void testFuncSparseDDMatrix( size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real scaleT = 1.0; // init Test object - FunctionCompare test("MulOp", - FuncConfig().set("aTrans", false).set("bTrans", false)); + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); // prepare input arguments /// matrix A : M * K test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp index f77ac2a8c49c8..e286f4e5b8a42 100644 --- a/paddle/function/PadOpTest.cpp +++ b/paddle/function/PadOpTest.cpp @@ -25,7 +25,7 @@ TEST(Pad, real) { VLOG(3) << " numSamples=" << numSamples << " 
channels=" << channels << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; for (bool test_grad : {false, true}) { - FunctionCompare compare( + CpuGpuFuncCompare compare( test_grad ? "PadGrad" : "Pad", FuncConfig() .set>("channel", {2, 3}) From 455888c50964f86f6f434380ae9d17d7e7f4a454 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 26 May 2017 20:09:11 +0800 Subject: [PATCH 05/24] Add ConvOpTest for NaiveConv and GemmConv --- paddle/function/CMakeLists.txt | 4 +- paddle/function/ConvOpTest.cpp | 82 ++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 paddle/function/ConvOpTest.cpp diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 233a53709a80f..472a43fa485fe 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -14,8 +14,8 @@ add_library(paddle_function STATIC ${cpp_files} ${cu_objs}) add_dependencies(paddle_function ${external_project_dependencies}) add_dependencies(paddle_function gen_proto_cpp) -if(WITH_GPU) if(WITH_TESTING) +if(WITH_GPU) # TODO: # file(GLOB test_files . *OpTest.cpp) # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files}) @@ -29,6 +29,8 @@ if(WITH_TESTING) add_simple_unittest(MulOpTest) add_simple_unittest(CosSimOpTest) endif() + +add_simple_unittest(ConvOpTest) endif() add_style_check_target(paddle_function ${h_files}) diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp new file mode 100644 index 0000000000000..715fa58b5597c --- /dev/null +++ b/paddle/function/ConvOpTest.cpp @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "Function.h" +#include "FunctionTest.h" + +namespace paddle { + +typedef Compare2Function Compare2CpuFunction; + +class ConvolutionTest { +public: + ConvolutionTest(const std::string& conv1, + const std::string& conv2, + std::string algo = "auto") { + for (size_t batchSize : {1, 32}) { + for (size_t inputSize : {7, 14, 54}) { + for (size_t filterSize : {1, 3, 5}) { + for (size_t inputChannels : {3, 64}) { + for (size_t outputChannels : {3, 64, 128}) { + if (inputChannels < outputChannels) break; + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + if (padding >= filterSize) break; + size_t outputSize = + (inputSize - filterSize + 2 * padding + stride) / stride; + LOG(INFO) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputSize + << " inputWidth=" << inputSize + << " outputChannels=" << outputChannels + << " filterHeight=" << filterSize + << " filterWidth=" << filterSize + << " outputHeight=" << outputSize + << " outputWidth=" << outputSize + << " stride=" << stride << " padding=" << padding; + + Compare2CpuFunction test(conv1, + conv2, + FuncConfig() + .set("padding", padding) + .set("stride", stride) + .set("algo", algo)); + + TensorShape shape0{ + batchSize, inputChannels, inputSize, inputSize}; + TensorShape shape1{ + outputChannels, inputChannels, filterSize, filterSize}; + TensorShape shape2{ + batchSize, outputChannels, outputSize, outputSize}; + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape0)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape1)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape2)); + test.run(); + } + } + } + } + } + } + } + } +}; + +TEST(Convolution, GEMM) { + ConvolutionTest test("NaiveConv-CPU", "GemmConv-CPU"); +} + +} // namespace paddle From 048b14a9164d62c9b80138bb291d74f995478ddb Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 26 May 2017 21:28:59 +0800 Subject: [PATCH 06/24] Change stride to strides, and change padding to paddings. 
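
The convolution FuncConfig now takes height/width pairs instead of single
scalars. A sketch of how a caller builds the new config (this mirrors the
test change below; the concrete padding/stride/algo values are illustrative):

    // padding, stride and algo are illustrative local variables.
    std::vector<size_t> paddings = {padding, padding};
    std::vector<size_t> strides = {stride, stride};
    FuncConfig config = FuncConfig()
                            .set("paddings", paddings)
                            .set("strides", strides)
                            .set("algo", algo);
    // ConvFunctionBase::init() stores the two vectors and exposes the
    // strideH()/strideW()/paddingH()/paddingW() helpers used by the
    // convolution kernels.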
--- paddle/function/ConvOp.h | 15 +++++++++++---- paddle/function/ConvOpTest.cpp | 6 ++++-- paddle/function/GemmConvOp.cpp | 8 ++++---- paddle/function/NaiveConvOp.cpp | 16 ++++++++++------ 4 files changed, 29 insertions(+), 16 deletions(-) diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index 4d678cfe2735c..465db57ae7d82 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -36,8 +36,8 @@ class ConvFunctionBase : public FunctionBase { public: void init(const FuncConfig& config) override { // function arguments - stride_ = config.get("stride"); - padding_ = config.get("padding"); + strides_ = config.get>("strides"); + paddings_ = config.get>("paddings"); // number of inputs and outputs numInputs_ = 2; @@ -60,8 +60,15 @@ class ConvFunctionBase : public FunctionBase { } protected: - size_t padding_; - size_t stride_; + std::vector strides_; + std::vector paddings_; + inline int strideH() const { return strides_[0]; } + + inline int strideW() const { return strides_[1]; } + + inline int paddingH() const { return paddings_[0]; } + + inline int paddingW() const { return paddings_[1]; } }; } // namespace paddle diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index 715fa58b5597c..db8d9fa9da460 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -48,11 +48,13 @@ class ConvolutionTest { << " outputWidth=" << outputSize << " stride=" << stride << " padding=" << padding; + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; Compare2CpuFunction test(conv1, conv2, FuncConfig() - .set("padding", padding) - .set("stride", stride) + .set("paddings", paddings) + .set("strides", strides) .set("algo", algo)); TensorShape shape0{ diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 5e6ee24486213..42786e44e0e97 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -118,10 +118,10 @@ class GemmConvFunction : public ConvFunctionBase { inputWidth, filterHeight, filterWidth, - stride_, - stride_, - padding_, - padding_, + strideH(), + strideW(), + paddingH(), + paddingW(), outputHeight, outputWidth, colData); diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp index f13aa880a1e88..f5d2aa16ab9b8 100644 --- a/paddle/function/NaiveConvOp.cpp +++ b/paddle/function/NaiveConvOp.cpp @@ -37,14 +37,16 @@ class NaiveConvFunctor { size_t outputChannels, size_t outputHeight, size_t outputWidth, - size_t padding, - size_t stride) { + size_t paddingH, + size_t paddingW, + size_t strideH, + size_t strideW) { for (size_t batch = 0; batch < batchSize; batch++) { for (size_t outC = 0; outC < outputChannels; outC++) { for (size_t outH = 0; outH < outputHeight; outH++) { for (size_t outW = 0; outW < outputWidth; outW++) { - const int inStartH = (outH * stride) - padding; - const int inStartW = (outW * stride) - padding; + const int inStartH = (outH * strideH) - paddingH; + const int inStartW = (outW * strideW) - paddingW; T outValue = (T)0; for (size_t inC = 0; inC < inputChannels; inC++) { for (size_t fH = 0; fH < filterHeight; fH++) { @@ -118,8 +120,10 @@ class NaiveConvFunction : public ConvFunctionBase { outputChannels, outputHeight, outputWidth, - padding_, - stride_); + paddingH(), + paddingW(), + strideH(), + strideW()); } }; From 3ce974b94eb582f519110a63e03a89cb6c142de7 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 31 May 2017 13:01:45 +0800 Subject: [PATCH 07/24] Add group argument in ConvFunctionBase --- 
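Note (illustrative, not part of the diff): with groups_ > 1 the forward
pass slices each argument per group inside the batch loop, using the
offsets added to GemmConvOp.cpp below.

    // Per-group offsets computed in GemmConvOp.cpp:
    size_t inputOffset  = (inputChannels  / groups_) * inputHeight * inputWidth;
    size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth;
    size_t filterOffset = inputs[1].shape().getElements() / groups_;
    // Hypothetical example: inputChannels = 4, outputChannels = 6, groups_ = 2.
    // Group g reads input channels [2*g, 2*g + 2), uses filters
    // [3*g, 3*g + 3) of the [6, 2, filterHeight, filterWidth] filter
    // tensor, and writes output channels [3*g, 3*g + 3).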
paddle/function/ConvOp.h | 6 +++ paddle/function/ConvOpTest.cpp | 1 + paddle/function/GemmConvOp.cpp | 76 +++++++++++++++++++--------------- 3 files changed, 49 insertions(+), 34 deletions(-) diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index 465db57ae7d82..173ca228096d9 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -38,6 +38,7 @@ class ConvFunctionBase : public FunctionBase { // function arguments strides_ = config.get>("strides"); paddings_ = config.get>("paddings"); + groups_ = config.get("groups"); // number of inputs and outputs numInputs_ = 2; @@ -62,6 +63,11 @@ class ConvFunctionBase : public FunctionBase { protected: std::vector strides_; std::vector paddings_; + /// Group size, refer to grouped convolution in + /// Alex Krizhevsky's paper: when group=2, the first half of the + /// filters are only connected to the first half of the input channels, + /// and the second half only connected to the second half. + size_t groups_; inline int strideH() const { return strides_[0]; } inline int strideW() const { return strides_[1]; } diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index db8d9fa9da460..eb0084804814c 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -55,6 +55,7 @@ class ConvolutionTest { FuncConfig() .set("paddings", paddings) .set("strides", strides) + .set("groups", (size_t)1) .set("algo", algo)); TensorShape shape0{ diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 42786e44e0e97..b8e44cc60bce4 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -101,49 +101,57 @@ class GemmConvFunction : public ConvFunctionBase { size_t outputHeight = outputs[0].shape()[2]; size_t outputWidth = outputs[0].shape()[3]; + CHECK_EQ(inputChannels / groups_, inputs[1].shape()[1]); + real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); - size_t size = - inputChannels * filterHeight * filterWidth * outputHeight * outputWidth; + size_t size = inputChannels / groups_ * filterHeight * filterWidth * + outputHeight * outputWidth; resizeBuffer(size); real* colData = reinterpret_cast(memory_->getBuf()); Im2ColFunctor im2col; + size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = inputs[1].shape().getElements() / groups_; for (size_t i = 0; i < batchSize; i++) { - im2col(inputData, - inputChannels, - inputHeight, - inputWidth, - filterHeight, - filterWidth, - strideH(), - strideW(), - paddingH(), - paddingW(), - outputHeight, - outputWidth, - colData); - - int M = outputChannels; - int N = outputHeight * outputWidth; - int K = inputChannels * filterHeight * filterWidth; - gemm(CblasNoTrans, - CblasNoTrans, - M, - N, - K, - 1.0f, - filterData, - K, - colData, - N, - 0.0f, - outputData, - N); - inputData += inputChannels * inputHeight * inputWidth; - outputData += outputChannels * outputHeight * outputWidth; + for (int g = 0; g < groups_; g++) { + im2col(inputData + g * inputOffset, + inputChannels / groups_, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideH(), + strideW(), + paddingH(), + paddingW(), + outputHeight, + outputWidth, + colData); + + int M = outputChannels; + int N = outputHeight * outputWidth; + int K = inputChannels * filterHeight * filterWidth; + gemm(CblasNoTrans, + CblasNoTrans, + M, + N, + K, + 1.0f, + filterData 
+ g * filterOffset, + K, + colData, + N, + 0.0f, + outputData + g * outputOffset, + N); + inputData += inputChannels * inputHeight * inputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } } } From 3c0aa0cc34368e5561a856e10df906f2c2efe68d Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 2 Jun 2017 16:59:52 +0800 Subject: [PATCH 08/24] Add GPU GemmConvFunction implementation --- paddle/function/ConvOp.h | 2 + paddle/function/ConvOpTest.cpp | 26 +++++--- paddle/function/GemmConvOp.cpp | 34 +++++------ paddle/function/GemmConvOp.h | 44 +++++++++++++ paddle/function/GemmConvOpGpu.cu | 93 ++++++++++++++++++++++++++++ paddle/function/GemmFunctor.h | 102 +++++++++++++++++++++++++++++++ 6 files changed, 274 insertions(+), 27 deletions(-) create mode 100644 paddle/function/GemmConvOp.h create mode 100644 paddle/function/GemmConvOpGpu.cu create mode 100644 paddle/function/GemmFunctor.h diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index 173ca228096d9..017d4e26f2b7b 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once + #include "Function.h" namespace paddle { diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index eb0084804814c..896267141337f 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -19,8 +19,7 @@ limitations under the License. */ namespace paddle { -typedef Compare2Function Compare2CpuFunction; - +template class ConvolutionTest { public: ConvolutionTest(const std::string& conv1, @@ -50,13 +49,14 @@ class ConvolutionTest { std::vector paddings = {padding, padding}; std::vector strides = {stride, stride}; - Compare2CpuFunction test(conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)1) - .set("algo", algo)); + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)1) + .set("algo", algo)); TensorShape shape0{ batchSize, inputChannels, inputSize, inputSize}; @@ -79,7 +79,13 @@ class ConvolutionTest { }; TEST(Convolution, GEMM) { - ConvolutionTest test("NaiveConv-CPU", "GemmConv-CPU"); + ConvolutionTest test("NaiveConv-CPU", + "GemmConv-CPU"); +} + +TEST(Convolution, GEMM2) { + ConvolutionTest test("GemmConv-CPU", + "GemmConv-GPU"); } } // namespace paddle diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index b8e44cc60bce4..6857fe7482497 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "ConvOp.h" -#include "paddle/math/MathFunctions.h" +#include "GemmConvOp.h" +#include "GemmFunctor.h" #include "paddle/math/MemoryHandle.h" namespace paddle { @@ -24,7 +24,7 @@ namespace paddle { * output_height, output_width] */ template -class Im2ColFunctor { +class Im2ColFunctor { public: void operator()(const T* imData, int inputChannels, @@ -112,7 +112,8 @@ class GemmConvFunction : public ConvFunctionBase { resizeBuffer(size); real* colData = reinterpret_cast(memory_->getBuf()); - Im2ColFunctor im2col; + Im2ColFunctor im2col; + GemmFunctor gemm; size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; @@ -136,19 +137,17 @@ class GemmConvFunction : public ConvFunctionBase { int M = outputChannels; int N = outputHeight * outputWidth; int K = inputChannels * filterHeight * filterWidth; - gemm(CblasNoTrans, - CblasNoTrans, - M, - N, - K, - 1.0f, - filterData + g * filterOffset, - K, - colData, - N, - 0.0f, - outputData + g * outputOffset, - N); + gemm(M, + N, + K, + 1.0f, + filterData + g * filterOffset, + K, + colData, + N, + 0.0f, + outputData + g * outputOffset, + N); inputData += inputChannels * inputHeight * inputWidth; outputData += outputChannels * outputHeight * outputWidth; } @@ -166,5 +165,6 @@ class GemmConvFunction : public ConvFunctionBase { }; REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); +REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction); } // namespace paddle diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h new file mode 100644 index 0000000000000..652a64afba4a5 --- /dev/null +++ b/paddle/function/GemmConvOp.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ConvOp.h" + +namespace paddle { + +/* + * imData = [input_channels, input_height, input_width] + * colData = [input_channels, filter_height, filter_width, + * output_height, output_width] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth, + T* colData); +}; + +} // namespace paddle diff --git a/paddle/function/GemmConvOpGpu.cu b/paddle/function/GemmConvOpGpu.cu new file mode 100644 index 0000000000000..06b9904261cd7 --- /dev/null +++ b/paddle/function/GemmConvOpGpu.cu @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvOp.h" +#include "GemmConvOp.h" + +namespace paddle { + +template +__global__ +void im2col(const T* data_im, int numOuts, int height, int width, + int blockH, int blockW, + int strideH, int strideW, + int paddingH, int paddingW, + int height_col, int width_col, + T* data_col) { + int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < numOuts) { + int w_out = index % width_col; + index /= width_col; + int h_out = index % height_col; + int channel_in = index / height_col; + int channel_out = channel_in * blockH * blockW; + int h_in = h_out * strideH; + int w_in = w_out * strideW; + + data_col += (channel_out * height_col + h_out) * width_col + w_out; + for (int i = 0; i < blockH; ++i) { + for (int j = 0; j < blockW; ++j) { + int rIdx = int(h_in+i); + int cIdx = int(w_in+j); + if ((rIdx-(int)paddingH) >= (int)height || + (rIdx-(int)paddingH) < 0 || + (cIdx-(int)paddingW) >= (int)width || + (cIdx-(int)paddingW) < 0) { + *data_col = 0; + } else { + rIdx = rIdx + channel_in*height - paddingH; + cIdx = cIdx - paddingW; + *data_col = data_im[rIdx* width + cIdx]; + } + data_col += height_col * width_col; + } + } + } +} + +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth, + T* colData) { + int numKernels = inputChannels * outputHeight * outputWidth; + int blocks = (numKernels + 1024 -1) / 1024; + int blockX = 512; + int blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + im2col<<< grid, threads, 0, STREAM_DEFAULT >>> + (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth, + strideHeight, strideWidth, paddingHeight, paddingWidth, + outputHeight, outputWidth, colData); + CHECK_SYNC("Im2ColFunctor GPU failed"); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; + +} // namespace paddle diff --git a/paddle/function/GemmFunctor.h b/paddle/function/GemmFunctor.h new file mode 100644 index 0000000000000..5fb2f8a6d9e8f --- /dev/null +++ b/paddle/function/GemmFunctor.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/math/MathFunctions.h" + +namespace paddle { + +// TODO(hedaoyuan): Since the hl_matrix_mul interface does not conform to the +// cblas_dgemm interface's parameter format, it is necessary to introduce +// GemmFunctor as a new interface. 
Later, when considering the implementation +// of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul +// interface. +template +class GemmFunctor { +public: + void operator()(const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc); +}; + +template +class GemmFunctor { +public: + void operator()(const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc) { + gemm(CblasNoTrans, + CblasNoTrans, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); + } +}; + +template +class GemmFunctor { +public: + void operator()(const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc) { + hl_matrix_mul((T*)A, + HPPL_OP_N, + (T*)B, + HPPL_OP_N, + C, + M, + N, + K, + alpha, + beta, + lda, + ldb, + ldc); + } +}; + +} // namespace paddle From c70d3e1af85f960555fcdcd5341453e277c326ec Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 2 Jun 2017 18:17:20 +0800 Subject: [PATCH 09/24] Some bug fix --- paddle/function/GemmConvOp.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 6857fe7482497..e7a93ae676ff7 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -96,7 +96,7 @@ class GemmConvFunction : public ConvFunctionBase { size_t inputHeight = inputs[0].shape()[2]; size_t inputWidth = inputs[0].shape()[3]; size_t filterHeight = inputs[1].shape()[2]; - size_t filterWidth = inputs[1].shape()[2]; + size_t filterWidth = inputs[1].shape()[3]; size_t outputChannels = outputs[0].shape()[1]; size_t outputHeight = outputs[0].shape()[2]; size_t outputWidth = outputs[0].shape()[3]; @@ -148,23 +148,29 @@ class GemmConvFunction : public ConvFunctionBase { 0.0f, outputData + g * outputOffset, N); - inputData += inputChannels * inputHeight * inputWidth; - outputData += outputChannels * outputHeight * outputWidth; } + inputData += inputChannels * inputHeight * inputWidth; + outputData += outputChannels * outputHeight * outputWidth; } } void resizeBuffer(size_t newSize) { if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) { - memory_ = std::make_shared(newSize * sizeof(real)); + if (Device == DEVICE_TYPE_CPU) { + memory_ = std::make_shared(newSize * sizeof(real)); + } else { + memory_ = std::make_shared(newSize * sizeof(real)); + } } } private: - CpuMemHandlePtr memory_; + MemoryHandlePtr memory_; }; REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); +#ifndef PADDLE_ONLY_CPU REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction); +#endif } // namespace paddle From 3408b4b2f409a5a8191248c7c17e1c882779de27 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 2 Jun 2017 19:52:08 +0800 Subject: [PATCH 10/24] Bug fix --- paddle/function/ConvOp.h | 2 +- paddle/function/ConvOpTest.cpp | 2 ++ paddle/function/GemmConvOp.cpp | 6 ++---- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index 017d4e26f2b7b..14c20b74f2007 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -58,7 +58,7 @@ class ConvFunctionBase : public FunctionBase { CHECK_EQ(outputs[0].shape().ndims(), (size_t)4); CHECK(inputs[0].shape()[0] == outputs[0].shape()[0]); - CHECK(inputs[0].shape()[1] == 
inputs[1].shape()[1]); + CHECK(inputs[0].shape()[1] / groups_ == inputs[1].shape()[1]); CHECK(outputs[0].shape()[1] == inputs[1].shape()[0]); } diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index 896267141337f..d9de211448879 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -83,9 +83,11 @@ TEST(Convolution, GEMM) { "GemmConv-CPU"); } +#ifndef PADDLE_ONLY_CPU TEST(Convolution, GEMM2) { ConvolutionTest test("GemmConv-CPU", "GemmConv-GPU"); } +#endif } // namespace paddle diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index e7a93ae676ff7..78aa8f14f3445 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -101,8 +101,6 @@ class GemmConvFunction : public ConvFunctionBase { size_t outputHeight = outputs[0].shape()[2]; size_t outputWidth = outputs[0].shape()[3]; - CHECK_EQ(inputChannels / groups_, inputs[1].shape()[1]); - real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); @@ -134,9 +132,9 @@ class GemmConvFunction : public ConvFunctionBase { outputWidth, colData); - int M = outputChannels; + int M = outputChannels / groups_; int N = outputHeight * outputWidth; - int K = inputChannels * filterHeight * filterWidth; + int K = inputChannels / groups_ * filterHeight * filterWidth; gemm(M, N, K, From afbe556e565a880c40a60ea90e21debd4cb139bd Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Mon, 5 Jun 2017 17:08:34 +0800 Subject: [PATCH 11/24] Modify the arguments description of ConvFunctionBase. And add the definition of backward input and backward filter function. --- paddle/function/ConvOp.h | 70 ++++++++++++++++++++---------- paddle/function/GemmConvOp.cpp | 75 +++++++++++++++++++++++++++------ paddle/function/NaiveConvOp.cpp | 7 ++- 3 files changed, 114 insertions(+), 38 deletions(-) diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index 14c20b74f2007..8f2c0c4cb8ab4 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -19,20 +19,36 @@ limitations under the License. */ namespace paddle { /* - * Function Arguments: + * \brief Based on the ConvFunctionBase class, the forward calculation, + * backward input calculation and backward filter calculation + * of convolution operations can be implemented. * - * \param inputs[0] Input image data, is NCHW format, where N is batch size, - * C is the number of channels, H and W is the height and - * width of input image. - * \param inputs[1] Filter data, is MCHW, where M is the number of output - * channels, C is the number of input channels, H and W - * is height and width of filter. - * \param outputs[0] Output image data, is NCHW format, where N is batch size, - * C is the number of channels, H and W is the height and - * width of output image. + * Arguments of forward and backward calculation: + * 1. Forward calculation of convolution. + * inputs = {INPUT, FILTER}, outputs = {OUTPUT} + * The first and second input arguments are input image and filter data. + * The output argument is output image. * - * \note Implemented based on the ConvFunctionBase class only supports - * input data in the NCHW format. + * 2. Backward input calculation of convolution. + * inputs = {OUTPUT_GRAD, FILTER}, outputs = {INPUT_GRAD} + * The first and second input arguments are output grad image + * and filter data. + * The output argument is input grad image. + * + * 3. Backward filter calculation of convolution. 
+ * inputs = {OUTPUT_GRAD, INPUT}, outputs = {FILTER_GRAD} + * The first and second input arguments are output grad image + * and input image. + * The output argument is filter grad. + * + * Arguments format of input, filter and output: + * 1. Input image, output image, input image gradient, output image gradient + * are all NCHW format. Where N is batch size, C is the number of channels, + * H and W is the height and width of image or image gradient. + * + * 2. The format of the filter data is MCHW, where M is the number of + * output image channels, C is the number of input image channels, + * H and W is height and width of filter. */ class ConvFunctionBase : public FunctionBase { public: @@ -49,17 +65,25 @@ class ConvFunctionBase : public FunctionBase { virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {} - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - - CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); - CHECK_EQ(inputs[1].shape().ndims(), (size_t)4); - CHECK_EQ(outputs[0].shape().ndims(), (size_t)4); - - CHECK(inputs[0].shape()[0] == outputs[0].shape()[0]); - CHECK(inputs[0].shape()[1] / groups_ == inputs[1].shape()[1]); - CHECK(outputs[0].shape()[1] == inputs[1].shape()[0]); + // input can be INPUT and INPUT_GRAD + // filter can be FILTER and FILTER_GRAD + // output can be OUTPUT and OUTPUT_GRAD + void check(const TensorShape& input, + const TensorShape& filter, + const TensorShape& output) { + // inputs and outputs arguments should be 4-dimensional. + CHECK_EQ(input.ndims(), (size_t)4); + CHECK_EQ(filter.ndims(), (size_t)4); + CHECK_EQ(output.ndims(), (size_t)4); + + // The batchSize of the input needs to be equal to + // the batchSize of the output. + CHECK_EQ(input[0], output[0]); + + // The input and output channel dimensions are the second and first + // dimensions of the filter shape. + CHECK_EQ(input[1] / groups_, filter[1]); + CHECK_EQ(output[1], filter[0]); } protected: diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 78aa8f14f3445..109ed20ab0666 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -68,17 +68,7 @@ class Im2ColFunctor { }; /* - * Function Arguments: - * - * \param inputs[0] Input image data, is NCHW format, where N is batch size, - * C is the number of channels, H and W is the height and - * width of input image. - * \param inputs[1] Filter data, is MCHW, where M is the number of output - * channels, C is the number of input channels, H and W - * is height and width of filter. - * \param outputs[0] Output image data, is NCHW format, where N is batch size, - * C is the number of channels, H and W is the height and - * width of output image. + * \brief Forward calculation of convolution. */ template class GemmConvFunction : public ConvFunctionBase { @@ -88,8 +78,21 @@ class GemmConvFunction : public ConvFunctionBase { } void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - check(inputs, outputs); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + // TODO(hedaoyuan): Need to define some index macros, + // to avoid useing 0 and 1. 
+ const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + check(input, filter, output); + + real beta; + if (outputs[0].getArgType() == ADD_TO) { + beta = 1.0; + } else { + beta = 0.0; + } size_t batchSize = inputs[0].shape()[0]; size_t inputChannels = inputs[0].shape()[1]; @@ -143,7 +146,7 @@ class GemmConvFunction : public ConvFunctionBase { K, colData, N, - 0.0f, + beta, outputData + g * outputOffset, N); } @@ -166,9 +169,53 @@ class GemmConvFunction : public ConvFunctionBase { MemoryHandlePtr memory_; }; +/* + * \brief Backward input calculation of convolution. + */ +template +class GemmConvGradInputFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + const TensorShape& outputGrad = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& inputGrad = outputs[0].shape(); + check(inputGrad, filter, outputGrad); + } +}; + +/* + * \brief Backward filter calculation of convolution. + */ +template +class GemmConvGradFilterFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + const TensorShape& outputGrad = inputs[0].shape(); + const TensorShape& input = inputs[1].shape(); + const TensorShape& filterGrad = outputs[0].shape(); + check(input, filterGrad, outputGrad); + } +}; + REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); +REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); +REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); #ifndef PADDLE_ONLY_CPU REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction); +REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction); +REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction); #endif } // namespace paddle diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp index f5d2aa16ab9b8..94aba253e3e56 100644 --- a/paddle/function/NaiveConvOp.cpp +++ b/paddle/function/NaiveConvOp.cpp @@ -91,7 +91,12 @@ class NaiveConvFunction : public ConvFunctionBase { } void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - check(inputs, outputs); + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + check(input, filter, output); CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); size_t batchSize = inputs[0].shape()[0]; From 6a93f0f37a7e4463ed2b1ed19a738a3ceeb3d04a Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Mon, 5 Jun 2017 21:17:46 +0800 Subject: [PATCH 12/24] Add the calculation implementation of GemmConvGradFilterFunction --- paddle/function/ConvOp.h | 16 ++++++ paddle/function/GemmConvOp.cpp | 90 +++++++++++++++++++++++++++------- paddle/function/GemmFunctor.h | 30 +++++------- 3 files changed, 99 insertions(+), 37 deletions(-) diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index 8f2c0c4cb8ab4..9ad1785fbb47f 100644 --- a/paddle/function/ConvOp.h +++ 
b/paddle/function/ConvOp.h @@ -89,11 +89,13 @@ class ConvFunctionBase : public FunctionBase { protected: std::vector strides_; std::vector paddings_; + /// Group size, refer to grouped convolution in /// Alex Krizhevsky's paper: when group=2, the first half of the /// filters are only connected to the first half of the input channels, /// and the second half only connected to the second half. size_t groups_; + inline int strideH() const { return strides_[0]; } inline int strideW() const { return strides_[1]; } @@ -101,6 +103,20 @@ class ConvFunctionBase : public FunctionBase { inline int paddingH() const { return paddings_[0]; } inline int paddingW() const { return paddings_[1]; } + + // A temporary memory in convolution calculation. + MemoryHandlePtr memory_; + + template + void resizeBuffer(size_t newSize) { + if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) { + if (Device == DEVICE_TYPE_CPU) { + memory_ = std::make_shared(newSize * sizeof(real)); + } else { + memory_ = std::make_shared(newSize * sizeof(real)); + } + } + } }; } // namespace paddle diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 109ed20ab0666..6b5db1d62ed40 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -110,7 +110,7 @@ class GemmConvFunction : public ConvFunctionBase { size_t size = inputChannels / groups_ * filterHeight * filterWidth * outputHeight * outputWidth; - resizeBuffer(size); + resizeBuffer(size); real* colData = reinterpret_cast(memory_->getBuf()); Im2ColFunctor im2col; @@ -120,7 +120,7 @@ class GemmConvFunction : public ConvFunctionBase { (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = inputs[1].shape().getElements() / groups_; for (size_t i = 0; i < batchSize; i++) { - for (int g = 0; g < groups_; g++) { + for (size_t g = 0; g < groups_; g++) { im2col(inputData + g * inputOffset, inputChannels / groups_, inputHeight, @@ -138,7 +138,9 @@ class GemmConvFunction : public ConvFunctionBase { int M = outputChannels / groups_; int N = outputHeight * outputWidth; int K = inputChannels / groups_ * filterHeight * filterWidth; - gemm(M, + gemm(CblasNoTrans, + CblasNoTrans, + M, N, K, 1.0f, @@ -154,19 +156,6 @@ class GemmConvFunction : public ConvFunctionBase { outputData += outputChannels * outputHeight * outputWidth; } } - - void resizeBuffer(size_t newSize) { - if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) { - if (Device == DEVICE_TYPE_CPU) { - memory_ = std::make_shared(newSize * sizeof(real)); - } else { - memory_ = std::make_shared(newSize * sizeof(real)); - } - } - } - -private: - MemoryHandlePtr memory_; }; /* @@ -202,10 +191,73 @@ class GemmConvGradFilterFunction : public ConvFunctionBase { void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(numInputs_, inputs.size()); CHECK_EQ(numOutputs_, outputs.size()); - const TensorShape& outputGrad = inputs[0].shape(); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + const TensorShape& output = inputs[0].shape(); const TensorShape& input = inputs[1].shape(); - const TensorShape& filterGrad = outputs[0].shape(); - check(input, filterGrad, outputGrad); + const TensorShape& filter = outputs[0].shape(); + check(input, filter, output); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = filter[2]; + size_t filterWidth = filter[3]; + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + 
size_t outputWidth = output[3]; + + real* outputGrad = inputs[0].data(); + real* inputData = inputs[1].data(); + real* filterGrad = outputs[0].data(); + + size_t size = inputChannels / groups_ * filterHeight * filterWidth * + outputHeight * outputWidth; + resizeBuffer(size); + real* colData = reinterpret_cast(memory_->getBuf()); + + Im2ColFunctor im2col; + GemmFunctor gemm; + size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + for (size_t i = 0; i < batchSize; i++) { + for (size_t g = 0; g < groups_; g++) { + im2col(inputData + g * inputOffset, + inputChannels / groups_, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideH(), + strideW(), + paddingH(), + paddingW(), + outputHeight, + outputWidth, + colData); + + int M = outputChannels / groups_; + int K = outputHeight * outputWidth; + int N = inputChannels / groups_ * filterHeight * filterWidth; + gemm(CblasNoTrans, + CblasTrans, + M, + N, + K, + 1.0f, + outputGrad + g * outputOffset, + K, + colData, + K, + 1.0f, + filterGrad + g * filterOffset, + N); + } + } + inputData += inputChannels * inputHeight * inputWidth; + outputGrad += outputChannels * outputHeight * outputWidth; } }; diff --git a/paddle/function/GemmFunctor.h b/paddle/function/GemmFunctor.h index 5fb2f8a6d9e8f..d5db5cf5e7a85 100644 --- a/paddle/function/GemmFunctor.h +++ b/paddle/function/GemmFunctor.h @@ -26,7 +26,9 @@ namespace paddle { template class GemmFunctor { public: - void operator()(const int M, + void operator()(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, @@ -42,7 +44,9 @@ class GemmFunctor { template class GemmFunctor { public: - void operator()(const int M, + void operator()(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, @@ -53,26 +57,16 @@ class GemmFunctor { const T beta, T* C, const int ldc) { - gemm(CblasNoTrans, - CblasNoTrans, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); + gemm(transA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } }; template class GemmFunctor { public: - void operator()(const int M, + void operator()(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, @@ -84,9 +78,9 @@ class GemmFunctor { T* C, const int ldc) { hl_matrix_mul((T*)A, - HPPL_OP_N, + transA == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T, (T*)B, - HPPL_OP_N, + TransB == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T, C, M, N, From 90326198e929772fe3e87fe5c067f057927f7f64 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Mon, 5 Jun 2017 21:35:10 +0800 Subject: [PATCH 13/24] Bug fix & add test of GemmConvGradFilter. --- paddle/function/ConvOpTest.cpp | 53 +++++++++++++++++++++++++--------- paddle/function/GemmConvOp.cpp | 4 +-- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index d9de211448879..e2997df0128e1 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -19,11 +19,18 @@ limitations under the License. 
*/ namespace paddle { +enum TestType { + FORWARD_TEST = 0, + BACKWARD_INPUT_TEST = 1, + BACKWARD_FILTER_TEST = 2, +}; + template class ConvolutionTest { public: ConvolutionTest(const std::string& conv1, const std::string& conv2, + TestType type, std::string algo = "auto") { for (size_t batchSize : {1, 32}) { for (size_t inputSize : {7, 14, 54}) { @@ -58,16 +65,31 @@ class ConvolutionTest { .set("groups", (size_t)1) .set("algo", algo)); - TensorShape shape0{ + TensorShape input{ batchSize, inputChannels, inputSize, inputSize}; - TensorShape shape1{ + TensorShape filter{ outputChannels, inputChannels, filterSize, filterSize}; - TensorShape shape2{ + TensorShape output{ batchSize, outputChannels, outputSize, outputSize}; - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape0)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape1)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape2)); - test.run(); + + if (type == FORWARD_TEST) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); + } else if (type == BACKWARD_INPUT_TEST) { +#if 0 + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.run(); +#endif + } else if (type == BACKWARD_FILTER_TEST) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.run(); + } } } } @@ -78,15 +100,20 @@ class ConvolutionTest { } }; -TEST(Convolution, GEMM) { - ConvolutionTest test("NaiveConv-CPU", - "GemmConv-CPU"); +TEST(Forward, GEMM) { + ConvolutionTest test( + "NaiveConv-CPU", "GemmConv-CPU", FORWARD_TEST); } #ifndef PADDLE_ONLY_CPU -TEST(Convolution, GEMM2) { - ConvolutionTest test("GemmConv-CPU", - "GemmConv-GPU"); +TEST(Forward, GEMM2) { + ConvolutionTest test( + "GemmConv-CPU", "GemmConv-GPU", FORWARD_TEST); +} + +TEST(BackwardFilter, GEMM) { + ConvolutionTest test( + "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", BACKWARD_FILTER_TEST); } #endif diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 6b5db1d62ed40..414c7a885b67b 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -255,9 +255,9 @@ class GemmConvGradFilterFunction : public ConvFunctionBase { filterGrad + g * filterOffset, N); } + inputData += inputChannels * inputHeight * inputWidth; + outputGrad += outputChannels * outputHeight * outputWidth; } - inputData += inputChannels * inputHeight * inputWidth; - outputGrad += outputChannels * outputHeight * outputWidth; } }; From d99faf310865fe500083f0db53063e53efd2731f Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 6 Jun 2017 12:51:30 +0800 Subject: [PATCH 14/24] Add the calculation implementation of GemmConvGradInputFunction. 
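Note on this patch (explanatory sketch only, not part of the diffs below): the new
GemmConvGradInputFunction computes the input gradient per group as
col2im(filter^T * outputGrad), using GemmFunctor with CblasTrans on the filter and the
new Col2ImFunctor to scatter the column buffer back into the image. Because Col2ImFunctor
accumulates into imData, the result effectively has ADD_TO semantics. A minimal standalone
sketch of the same computation is shown here, assuming the NCHW/MCHW layouts documented in
ConvOp.h, a single stride/padding value, and groups == 1; the function and parameter names
are illustrative and do not exist in the code base.

    #include <cstddef>

    // Standalone sketch of the backward-input computation (groups == 1).
    // Layouts follow the convention documented in ConvOp.h:
    //   outputGrad [N, M, outH, outW], filterData [M, C, fH, fW],
    //   inputGrad  [N, C, inH, inW] (accumulated, i.e. ADD_TO semantics).
    template <class T>
    void naiveConvGradInput(const T* outputGrad,
                            const T* filterData,
                            T* inputGrad,
                            size_t batchSize,
                            size_t inputChannels,
                            size_t inputHeight,
                            size_t inputWidth,
                            size_t outputChannels,
                            size_t outputHeight,
                            size_t outputWidth,
                            size_t filterHeight,
                            size_t filterWidth,
                            size_t stride,
                            size_t padding) {
      for (size_t n = 0; n < batchSize; n++) {
        for (size_t m = 0; m < outputChannels; m++) {
          for (size_t oh = 0; oh < outputHeight; oh++) {
            for (size_t ow = 0; ow < outputWidth; ow++) {
              const T grad =
                  outputGrad[((n * outputChannels + m) * outputHeight + oh) *
                                 outputWidth + ow];
              for (size_t c = 0; c < inputChannels; c++) {
                for (size_t fh = 0; fh < filterHeight; fh++) {
                  for (size_t fw = 0; fw < filterWidth; fw++) {
                    // Same index mapping as the forward pass:
                    // ih = oh * stride + fh - padding.
                    const int ih =
                        static_cast<int>(oh * stride + fh) - static_cast<int>(padding);
                    const int iw =
                        static_cast<int>(ow * stride + fw) - static_cast<int>(padding);
                    if (ih >= 0 && ih < static_cast<int>(inputHeight) &&
                        iw >= 0 && iw < static_cast<int>(inputWidth)) {
                      inputGrad[((n * inputChannels + c) * inputHeight + ih) *
                                    inputWidth + iw] +=
                          grad * filterData[((m * inputChannels + c) * filterHeight +
                                             fh) * filterWidth + fw];
                    }
                  }
                }
              }
            }
          }
        }
      }
    }

The GEMM call in the diff below corresponds to this loop nest with
M = inputChannels / groups_ * filterHeight * filterWidth, N = outputHeight * outputWidth
and K = outputChannels / groups_, followed by col2im to do the scatter-add.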
--- paddle/function/ConvOpTest.cpp | 7 +- paddle/function/GemmConvOp.cpp | 142 +++++++++++++++++++++++++++---- paddle/function/GemmConvOp.h | 18 ++++ paddle/function/GemmConvOpGpu.cu | 93 ++++++++++++++++++++ 4 files changed, 242 insertions(+), 18 deletions(-) diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index e2997df0128e1..2fa0b36546514 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -78,12 +78,10 @@ class ConvolutionTest { test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); test.run(); } else if (type == BACKWARD_INPUT_TEST) { -#if 0 test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input)); test.run(); -#endif } else if (type == BACKWARD_FILTER_TEST) { test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); @@ -111,6 +109,11 @@ TEST(Forward, GEMM2) { "GemmConv-CPU", "GemmConv-GPU", FORWARD_TEST); } +TEST(BackwardInput, GEMM) { + ConvolutionTest test( + "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", BACKWARD_INPUT_TEST); +} + TEST(BackwardFilter, GEMM) { ConvolutionTest test( "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", BACKWARD_FILTER_TEST); diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 414c7a885b67b..bb7bc6477927e 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -44,22 +44,62 @@ class Im2ColFunctor { for (int c = 0; c < channelsCol; ++c) { int wOffset = c % filterWidth; int hOffset = (c / filterWidth) % filterHeight; - int c_im = c / filterHeight / filterWidth; + int c_im = c / filterWidth / filterHeight; for (int h = 0; h < outputHeight; ++h) { for (int w = 0; w < outputWidth; ++w) { - // no c_im*height to Exclude the channel number - int imgRowIdx = h * strideHeight + hOffset; - int imgColIdx = w * strideWidth + wOffset; - if ((imgRowIdx - paddingHeight) < 0 || - (imgRowIdx - paddingHeight) >= inputHeight || - (imgColIdx - paddingWidth) < 0 || - (imgColIdx - paddingWidth) >= inputWidth) { + int imRowIdx = h * strideHeight + hOffset; + int imColIdx = w * strideWidth + wOffset; + if ((imRowIdx - paddingHeight) < 0 || + (imRowIdx - paddingHeight) >= inputHeight || + (imColIdx - paddingWidth) < 0 || + (imColIdx - paddingWidth) >= inputWidth) { colData[(c * outputHeight + h) * outputWidth + w] = T(0); } else { - imgRowIdx += c_im * inputHeight - paddingHeight; - imgColIdx -= paddingWidth; + imRowIdx += c_im * inputHeight - paddingHeight; + imColIdx -= paddingWidth; colData[(c * outputHeight + h) * outputWidth + w] = - imData[imgRowIdx * inputWidth + imgColIdx]; + imData[imRowIdx * inputWidth + imColIdx]; + } + } + } + } + } +}; + +template +class Col2ImFunctor { +public: + void operator()(const T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth, + T* imData) { + int channelsCol = inputChannels * filterHeight * filterWidth; + + for (int c = 0; c < channelsCol; ++c) { + int wOffset = c % filterWidth; + int hOffset = (c / filterWidth) % filterHeight; + int c_im = c / filterWidth / filterHeight; + for (int h = 0; h < outputHeight; ++h) { + for (int w = 0; w < outputWidth; ++w) { + int imRowIdx = h * strideHeight + hOffset; + int imColIdx = w * strideWidth + wOffset; + if ((imRowIdx - paddingHeight) >= 0 && + (imRowIdx - 
paddingHeight) < inputHeight && + (imColIdx - paddingWidth) >= 0 && + (imColIdx - paddingWidth) < inputWidth) { + imRowIdx += c_im * inputHeight - paddingHeight; + imColIdx -= paddingWidth; + imData[imRowIdx * inputWidth + imColIdx] += + colData[(c * outputHeight + h) * outputWidth + w]; } } } @@ -171,10 +211,74 @@ class GemmConvGradInputFunction : public ConvFunctionBase { void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(numInputs_, inputs.size()); CHECK_EQ(numOutputs_, outputs.size()); - const TensorShape& outputGrad = inputs[0].shape(); + // CHECK_EQ(outputs[0].getArgType(), ADD_TO); + const TensorShape& output = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); - const TensorShape& inputGrad = outputs[0].shape(); - check(inputGrad, filter, outputGrad); + const TensorShape& input = outputs[0].shape(); + check(input, filter, output); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = filter[2]; + size_t filterWidth = filter[3]; + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* outputGrad = inputs[0].data(); + real* filterData = inputs[1].data(); + real* inputGrad = outputs[0].data(); + + size_t size = inputChannels / groups_ * filterHeight * filterWidth * + outputHeight * outputWidth; + resizeBuffer(size); + real* colData = reinterpret_cast(memory_->getBuf()); + + Col2ImFunctor col2im; + GemmFunctor gemm; + size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + + for (size_t i = 0; i < batchSize; i++) { + for (size_t g = 0; g < groups_; g++) { + int K = outputChannels / groups_; + int N = outputHeight * outputWidth; + int M = inputChannels / groups_ * filterHeight * filterWidth; + gemm(CblasTrans, + CblasNoTrans, + M, + N, + K, + 1.0f, + filterData + g * filterOffset, + M, + outputGrad + g * outputOffset, + N, + 0.0f, + colData, + N); + + col2im(colData, + inputChannels / groups_, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideH(), + strideW(), + paddingH(), + paddingW(), + outputHeight, + outputWidth, + inputGrad + g * inputOffset); + } + inputGrad += inputChannels * inputHeight * inputWidth; + outputGrad += outputChannels * outputHeight * outputWidth; + } } }; @@ -191,12 +295,18 @@ class GemmConvGradFilterFunction : public ConvFunctionBase { void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(numInputs_, inputs.size()); CHECK_EQ(numOutputs_, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); const TensorShape& output = inputs[0].shape(); const TensorShape& input = inputs[1].shape(); const TensorShape& filter = outputs[0].shape(); check(input, filter, output); + real beta; + if (outputs[0].getArgType() == ADD_TO) { + beta = 1.0; + } else { + beta = 0.0; + } + size_t batchSize = input[0]; size_t inputChannels = input[1]; size_t inputHeight = input[2]; @@ -251,7 +361,7 @@ class GemmConvGradFilterFunction : public ConvFunctionBase { K, colData, K, - 1.0f, + i == 0 ? 
beta : 1.0f, filterGrad + g * filterOffset, N); } diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h index 652a64afba4a5..9f11cce597a07 100644 --- a/paddle/function/GemmConvOp.h +++ b/paddle/function/GemmConvOp.h @@ -41,4 +41,22 @@ class Im2ColFunctor { T* colData); }; +template +class Col2ImFunctor { +public: + void operator()(const T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth, + T* imData); +}; + } // namespace paddle diff --git a/paddle/function/GemmConvOpGpu.cu b/paddle/function/GemmConvOpGpu.cu index 06b9904261cd7..2a1795ff0fb56 100644 --- a/paddle/function/GemmConvOpGpu.cu +++ b/paddle/function/GemmConvOpGpu.cu @@ -87,7 +87,100 @@ public: } }; +template +__global__ +void col2im(size_t n, const T* data_col, size_t height, + size_t width, size_t channels, + size_t blockH, size_t blockW, + size_t strideH, size_t strideW, + size_t paddingH, size_t paddingW, + size_t height_col, size_t width_col, + T* data_im) { + size_t index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < n) { + T val = 0; + int w = int(index % width); + int h = int((index / width) % height); + int c = int(index / (width * height)); + if ((w - (int)paddingW) >= 0 && + (w - (int)paddingW) < (width-2 * paddingW) && + (h - (int)paddingH) >= 0 && + (h - paddingH) < (height - 2 * paddingH)) { + // compute the start and end of the output + int w_col_start = + (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; + int w_col_end = + min((int)(w / (int)strideW + 1), (int)(width_col)); + int h_col_start = + (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; + int h_col_end = min(int(h / strideH + 1), int(height_col)); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + // the col location: [c * width * height + h_out, w_out] + int c_col = int(c * blockH* blockW) + \ + (h - h_col * (int)strideH) * (int)blockW + + (w - w_col * (int)strideW); + val += data_col[(c_col * height_col + h_col) * width_col + w_col]; + } + } + h -= paddingH; + w -= paddingW; + data_im[c*((width-2*paddingW) * (height-2*paddingH)) + + h*(width-2*paddingW) + w] += val; + } + } +} + +template +class Col2ImFunctor { +public: + void operator()(const T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth, + T* imData) { + size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) + * (inputWidth + 2*paddingWidth); + + size_t blocks = (numKernels + 1024 -1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks+512-1)/512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
+ col2im<<< grid, threads, 0, STREAM_DEFAULT >>> + (numKernels, + colData, + inputHeight + 2*paddingHeight, + inputWidth + 2*paddingWidth, + inputChannels, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth, + imData); + CHECK_SYNC("Col2ImFunctor GPU failed"); + } +}; + template class Im2ColFunctor; template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; } // namespace paddle From 9885c578d758b2cdb9334e4905a4842283985f5a Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 6 Jun 2017 12:53:39 +0800 Subject: [PATCH 15/24] format --- paddle/function/GemmConvOp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index bb7bc6477927e..9dab831d32cef 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -239,7 +239,7 @@ class GemmConvGradInputFunction : public ConvFunctionBase { Col2ImFunctor col2im; GemmFunctor gemm; size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; - size_t outputOffset = + size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; From 7aac38c794ffb1df10db4d61ac4350a32bdc62ed Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 6 Jun 2017 15:45:20 +0800 Subject: [PATCH 16/24] Refactoring the code implementation of exconv adn exconvt layer with GemmConvFunction. --- paddle/gserver/layers/ConvBaseLayer.cpp | 6 +- paddle/gserver/layers/CudnnConvBaseLayer.cpp | 10 +- paddle/gserver/layers/ExpandConvLayer.cpp | 104 +++++++++++++++--- paddle/gserver/layers/ExpandConvLayer.h | 5 + .../gserver/layers/ExpandConvTransLayer.cpp | 2 +- 5 files changed, 98 insertions(+), 29 deletions(-) diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp index 7b234dc2a6663..e161d89c38a29 100644 --- a/paddle/gserver/layers/ConvBaseLayer.cpp +++ b/paddle/gserver/layers/ConvBaseLayer.cpp @@ -118,11 +118,7 @@ size_t ConvBaseLayer::calOutputSize() { layerSize = outH[0] * outW[0] * size_t(numFilters_); }; - if (isDeconv_) { - setLayerSize(outputH_, outputW_, imgSizeH_, imgSizeW_); - } else { - setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_); - } + setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_); return layerSize; } diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/gserver/layers/CudnnConvBaseLayer.cpp index 24363bb8b09cc..c056bbe4d1d35 100644 --- a/paddle/gserver/layers/CudnnConvBaseLayer.cpp +++ b/paddle/gserver/layers/CudnnConvBaseLayer.cpp @@ -70,14 +70,8 @@ void CudnnConvBaseLayer::forward(PassType passType) { if (biases_) { REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str()); int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - int outH, outW; - if (isDeconv_) { - outH = imgSizeH_[0]; - outW = imgSizeW_[0]; - } else { - outH = outputH_[0]; - outW = outputW_[0]; - } + int outH = outputH_[0]; + int outW = outputW_[0]; hl_tensor_reshape(outputDesc_, batchSize, diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index f9267b81a7d42..379b2d339d47b 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -18,32 +18,90 @@ limitations under the License. 
*/ namespace paddle { +/* + * The calculation of the exconvt(convolution transpose (deconv) operation) + * is a swap of forward and backward of the calculation of exconv. + * */ REGISTER_LAYER(exconv, ExpandConvLayer); +REGISTER_LAYER(exconvt, ExpandConvLayer); bool ExpandConvLayer::init(const LayerMap &layerMap, const ParameterMap ¶meterMap) { /* Initialize the basic convolutional parent class */ ExpandConvBaseLayer::init(layerMap, parameterMap); + + size_t numInputs = config_.inputs_size(); + inputShape_.resize(numInputs); + filterShape_.resize(numInputs); + outputShape_.resize(numInputs); + for (int i = 0; i < config_.inputs_size(); i++) { + std::vector paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; + std::vector strides = {(size_t)strideY_[i], (size_t)stride_[i]}; + createFunction(forward_, + !isDeconv_ ? "GemmConv" : "GemmConvGradInput", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i])); + + createFunction(backward_, + !isDeconv_ ? "GemmConvGradInput" : "GemmConv", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i])); + + createFunction(backward_, + "GemmConvGradFilter", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i])); + } return true; } +// i is the index of input layers +#define BACKWARD_INPUT(i, inputs, outputs) \ + backward_[2 * i]->calc(inputs, outputs) +#define BACKWARD_FILTER(i, inputs, outputs) \ + backward_[2 * i + 1]->calc(inputs, outputs) + void ExpandConvLayer::forward(PassType passType) { Layer::forward(passType); - /* malloc memory for the output_ if necessary */ - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); resetOutput(batchSize, getOutputSize()); - MatrixPtr image = nullptr; - MatrixPtr outV = getOutputValue(); + // Calculate the shape of the input, output, and filter. for (size_t i = 0; i < inputLayers_.size(); ++i) { - LayerPtr prevLayer = getPrev(i); - image = prevLayer->getOutputValue(); - for (size_t off = 0; off < image->getHeight(); off++) { - REGISTER_TIMER_INFO("expandFwdOnce", getName().c_str()); - expandFwdOnce(image, outV, i, off); - } + inputShape_[i] = TensorShape({(size_t)batchSize, + (size_t)channels_[i], + (size_t)imgSizeH_[i], + (size_t)imgSizeW_[i]}); + filterShape_[i] = + TensorShape({!isDeconv_ ? (size_t)numFilters_ : (size_t)channels_[i], + !isDeconv_ ? (size_t)channels_[i] : (size_t)numFilters_, + (size_t)filterSizeY_[i], + (size_t)filterSize_[i]}); + outputShape_[i] = TensorShape({(size_t)batchSize, + (size_t)numFilters_, + (size_t)outputH_[i], + (size_t)outputW_[i]}); } + + // Calculate the output value. + for (size_t i = 0; i < inputLayers_.size(); ++i) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(i), inputShape_[i]); + inputs.addArg(*weights_[i]->getW(), filterShape_[i]); + outputs.addArg( + *getOutputValue(), outputShape_[i], i == 0 ? ASSIGN_TO : ADD_TO); + + forward_[i]->calc(inputs, outputs); + } + /* add the bias-vector */ if (biases_.get()) { if (sharedBiases_) { @@ -67,14 +125,30 @@ void ExpandConvLayer::backward(const UpdateCallback &callback) { biases_->getParameterPtr()->incUpdate(callback); } + // Calculate the input grad and filter grad. 
for (size_t i = 0; i < inputLayers_.size(); ++i) { - /* First, calculate the input layers error */ - if (getPrev(i)->getOutputGrad()) { - bpropActs(outGrad, getPrev(i)->getOutputGrad(), i); + if (getInputGrad(i)) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outputShape_[i]); + inputs.addArg(*weights_[i]->getW(), filterShape_[i]); + outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO); + BACKWARD_INPUT(i, inputs, outputs); } + if (weights_[i]->getWGrad()) { - /* Then, calculate the W-gradient for the current layer */ - bpropWeights(getPrev(i)->getOutputValue(), outGrad, i); + BufferArgs inputs; + BufferArgs outputs; + if (!isDeconv_) { + inputs.addArg(*getOutputGrad(), outputShape_[i]); + inputs.addArg(*getInputValue(i), inputShape_[i]); + } else { + inputs.addArg(*getInputValue(i), inputShape_[i]); + inputs.addArg(*getOutputGrad(), outputShape_[i]); + } + outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO); + BACKWARD_FILTER(i, inputs, outputs); + /* Increasing the number of gradient */ weights_[i]->getParameterPtr()->incUpdate(callback); } diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h index 60681690e5dd5..a1f943d152154 100644 --- a/paddle/gserver/layers/ExpandConvLayer.h +++ b/paddle/gserver/layers/ExpandConvLayer.h @@ -40,6 +40,11 @@ class ExpandConvLayer : public ExpandConvBaseLayer { void forward(PassType passType) override; void backward(const UpdateCallback& callback) override; + +protected: + std::vector inputShape_; + std::vector filterShape_; + std::vector outputShape_; }; } // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvTransLayer.cpp b/paddle/gserver/layers/ExpandConvTransLayer.cpp index 520586b138897..b80a01e328784 100644 --- a/paddle/gserver/layers/ExpandConvTransLayer.cpp +++ b/paddle/gserver/layers/ExpandConvTransLayer.cpp @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { -REGISTER_LAYER(exconvt, ExpandConvTransLayer); +// REGISTER_LAYER(exconvt, ExpandConvTransLayer); bool ExpandConvTransLayer::init(const LayerMap &layerMap, const ParameterMap ¶meterMap) { From 784e21848fea3b183cc270b49bc94668ed9d0285 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 7 Jun 2017 17:56:32 +0800 Subject: [PATCH 17/24] Fix the error of group convolution. --- paddle/function/ConvOp.h | 48 +++++++++++++++++++---- paddle/function/ConvOpTest.cpp | 2 +- paddle/function/GemmConvOp.cpp | 33 +++++++++------- paddle/gserver/layers/ExpandConvLayer.cpp | 12 ++++-- 4 files changed, 67 insertions(+), 28 deletions(-) diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index 9ad1785fbb47f..2cfc0712557e5 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -46,8 +46,13 @@ namespace paddle { * are all NCHW format. Where N is batch size, C is the number of channels, * H and W is the height and width of image or image gradient. * - * 2. The format of the filter data is MCHW, where M is the number of - * output image channels, C is the number of input image channels, + * 2. The format of the filter data is MCHW, where M is the number of output + * image channels, C is the number of input image channels, + * H and W is height and width of filter. + * + * If groups is greater than 1, the filter's data format should be GMCHW, + * where G is the groups, and G * M is the number of output image channels, + * G * C is the number of input image channels, * H and W is height and width of filter. 
*/ class ConvFunctionBase : public FunctionBase { @@ -73,20 +78,47 @@ class ConvFunctionBase : public FunctionBase { const TensorShape& output) { // inputs and outputs arguments should be 4-dimensional. CHECK_EQ(input.ndims(), (size_t)4); - CHECK_EQ(filter.ndims(), (size_t)4); CHECK_EQ(output.ndims(), (size_t)4); - // The batchSize of the input needs to be equal to // the batchSize of the output. CHECK_EQ(input[0], output[0]); - // The input and output channel dimensions are the second and first - // dimensions of the filter shape. - CHECK_EQ(input[1] / groups_, filter[1]); - CHECK_EQ(output[1], filter[0]); + if (filter.ndims() == (size_t)4) { + // If the filter's dimension is 4, groups convolution is not supported. + CHECK_EQ(groups_, (size_t)1); + // The input and output channel dimensions are the second and first + // dimensions of the filter shape. + CHECK_EQ(input[1], filter[1]); + CHECK_EQ(output[1], filter[0]); + } else { + // filter argument should be 5-dimensional. + CHECK_EQ(filter.ndims(), (size_t)5); + // The first dimension of the filter is the size of the group + CHECK_EQ(filter[0], groups_); + // The input and output channel dimensions are the third and second + // dimensions of the filter shape. + CHECK_EQ(input[1], filter[2] * groups_); + CHECK_EQ(output[1], filter[1] * groups_); + } } protected: + size_t getFilterHeight(const TensorShape& filter) const { + if (filter.ndims() == 5) { + return filter[3]; + } else { + return filter[2]; + } + } + + size_t getFilterWidth(const TensorShape& filter) const { + if (filter.ndims() == 5) { + return filter[4]; + } else { + return filter[3]; + } + } + std::vector strides_; std::vector paddings_; diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index 2fa0b36546514..280aed8a5c8e4 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -80,7 +80,7 @@ class ConvolutionTest { } else if (type == BACKWARD_INPUT_TEST) { test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); test.run(); } else if (type == BACKWARD_FILTER_TEST) { test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 9dab831d32cef..c7a57801ed609 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -134,15 +134,15 @@ class GemmConvFunction : public ConvFunctionBase { beta = 0.0; } - size_t batchSize = inputs[0].shape()[0]; - size_t inputChannels = inputs[0].shape()[1]; - size_t inputHeight = inputs[0].shape()[2]; - size_t inputWidth = inputs[0].shape()[3]; - size_t filterHeight = inputs[1].shape()[2]; - size_t filterWidth = inputs[1].shape()[3]; - size_t outputChannels = outputs[0].shape()[1]; - size_t outputHeight = outputs[0].shape()[2]; - size_t outputWidth = outputs[0].shape()[3]; + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); @@ -158,7 +158,8 @@ class GemmConvFunction : public ConvFunctionBase { size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; size_t outputOffset = 
(outputChannels / groups_) * outputHeight * outputWidth; - size_t filterOffset = inputs[1].shape().getElements() / groups_; + size_t filterOffset = filter.getElements() / groups_; + for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { im2col(inputData + g * inputOffset, @@ -211,7 +212,9 @@ class GemmConvGradInputFunction : public ConvFunctionBase { void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(numInputs_, inputs.size()); CHECK_EQ(numOutputs_, outputs.size()); - // CHECK_EQ(outputs[0].getArgType(), ADD_TO); + // Since the implementation of Col2ImFunctor is ADD_TO, + // this function only supports ADD_TO mode. + CHECK_EQ(outputs[0].getArgType(), ADD_TO); const TensorShape& output = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& input = outputs[0].shape(); @@ -221,8 +224,8 @@ class GemmConvGradInputFunction : public ConvFunctionBase { size_t inputChannels = input[1]; size_t inputHeight = input[2]; size_t inputWidth = input[3]; - size_t filterHeight = filter[2]; - size_t filterWidth = filter[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); size_t outputChannels = output[1]; size_t outputHeight = output[2]; size_t outputWidth = output[3]; @@ -311,8 +314,8 @@ class GemmConvGradFilterFunction : public ConvFunctionBase { size_t inputChannels = input[1]; size_t inputHeight = input[2]; size_t inputWidth = input[3]; - size_t filterHeight = filter[2]; - size_t filterWidth = filter[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); size_t outputChannels = output[1]; size_t outputHeight = output[2]; size_t outputWidth = output[3]; diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index 379b2d339d47b..914689e66cdb8 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -80,8 +80,11 @@ void ExpandConvLayer::forward(PassType passType) { (size_t)imgSizeH_[i], (size_t)imgSizeW_[i]}); filterShape_[i] = - TensorShape({!isDeconv_ ? (size_t)numFilters_ : (size_t)channels_[i], - !isDeconv_ ? (size_t)channels_[i] : (size_t)numFilters_, + TensorShape({(size_t)groups_[i], + !isDeconv_ ? (size_t)numFilters_ / groups_[i] + : (size_t)channels_[i] / groups_[i], + !isDeconv_ ? (size_t)channels_[i] / groups_[i] + : (size_t)numFilters_ / groups_[i], (size_t)filterSizeY_[i], (size_t)filterSize_[i]}); outputShape_[i] = TensorShape({(size_t)batchSize, @@ -96,8 +99,9 @@ void ExpandConvLayer::forward(PassType passType) { BufferArgs outputs; inputs.addArg(*getInputValue(i), inputShape_[i]); inputs.addArg(*weights_[i]->getW(), filterShape_[i]); - outputs.addArg( - *getOutputValue(), outputShape_[i], i == 0 ? ASSIGN_TO : ADD_TO); + outputs.addArg(*getOutputValue(), + outputShape_[i], + !isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO); forward_[i]->calc(inputs, outputs); } From 95a7bc018fa4c33fb314bd1b981a4ef918e4396a Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 7 Jun 2017 18:00:39 +0800 Subject: [PATCH 18/24] follow comments --- paddle/function/ConvOpTest.cpp | 40 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index 280aed8a5c8e4..59c7238d218a5 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -20,9 +20,9 @@ limitations under the License. 
*/ namespace paddle { enum TestType { - FORWARD_TEST = 0, - BACKWARD_INPUT_TEST = 1, - BACKWARD_FILTER_TEST = 2, + kForwardTest = 0, + kBackwardInputTest = 1, + kBackwardFilterTest = 2, }; template @@ -43,16 +43,16 @@ class ConvolutionTest { if (padding >= filterSize) break; size_t outputSize = (inputSize - filterSize + 2 * padding + stride) / stride; - LOG(INFO) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputSize - << " inputWidth=" << inputSize - << " outputChannels=" << outputChannels - << " filterHeight=" << filterSize - << " filterWidth=" << filterSize - << " outputHeight=" << outputSize - << " outputWidth=" << outputSize - << " stride=" << stride << " padding=" << padding; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputSize + << " inputWidth=" << inputSize + << " outputChannels=" << outputChannels + << " filterHeight=" << filterSize + << " filterWidth=" << filterSize + << " outputHeight=" << outputSize + << " outputWidth=" << outputSize + << " stride=" << stride << " padding=" << padding; std::vector paddings = {padding, padding}; std::vector strides = {stride, stride}; @@ -72,17 +72,17 @@ class ConvolutionTest { TensorShape output{ batchSize, outputChannels, outputSize, outputSize}; - if (type == FORWARD_TEST) { + if (type == kForwardTest) { test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); test.run(); - } else if (type == BACKWARD_INPUT_TEST) { + } else if (type == kBackwardInputTest) { test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); test.run(); - } else if (type == BACKWARD_FILTER_TEST) { + } else if (type == kBackwardFilterTest) { test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); @@ -100,23 +100,23 @@ class ConvolutionTest { TEST(Forward, GEMM) { ConvolutionTest test( - "NaiveConv-CPU", "GemmConv-CPU", FORWARD_TEST); + "NaiveConv-CPU", "GemmConv-CPU", kForwardTest); } #ifndef PADDLE_ONLY_CPU TEST(Forward, GEMM2) { ConvolutionTest test( - "GemmConv-CPU", "GemmConv-GPU", FORWARD_TEST); + "GemmConv-CPU", "GemmConv-GPU", kForwardTest); } TEST(BackwardInput, GEMM) { ConvolutionTest test( - "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", BACKWARD_INPUT_TEST); + "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest); } TEST(BackwardFilter, GEMM) { ConvolutionTest test( - "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", BACKWARD_FILTER_TEST); + "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); } #endif From e039410eb74d24dced7c9e9108e717f0c4a0b7e4 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 8 Jun 2017 12:07:58 +0800 Subject: [PATCH 19/24] Remove the code of ExpandConvTransLayer. 
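Note (explanatory only): ExpandConvTransLayer can be deleted because the exconvt layer type
is now registered to ExpandConvLayer (see PATCH 16), which swaps the GemmConv and
GemmConvGradInput functions when isDeconv_ is true. For background, a small standalone
sketch of the size relation behind that swap; the forward formula is the one already used
in ConvOpTest.cpp, the transposed formula is a common way to write its inverse, and the
function names and sample numbers here are illustrative, not taken from the code base.

    #include <cassert>
    #include <cstddef>

    // Forward convolution output size (same formula as ConvOpTest.cpp).
    inline size_t convOutputSize(size_t inSize, size_t filterSize,
                                 size_t stride, size_t padding) {
      return (inSize - filterSize + 2 * padding + stride) / stride;
    }

    // Transposed convolution ("deconv") produces the input size that a forward
    // convolution with the same filter/stride/padding would have consumed.
    inline size_t convTransOutputSize(size_t inSize, size_t filterSize,
                                      size_t stride, size_t padding) {
      return (inSize - 1) * stride + filterSize - 2 * padding;
    }

    int main() {
      // With stride 1 the two relations are exact inverses.
      assert(convOutputSize(14, 3, 1, 1) == 14);
      assert(convTransOutputSize(14, 3, 1, 1) == 14);
      // With stride > 1 the forward size is rounded down, so the transposed
      // size is the smallest input size consistent with the forward output.
      assert(convOutputSize(14, 3, 2, 1) == 7);       // 14 -> 7
      assert(convTransOutputSize(7, 3, 2, 1) == 13);  // 7 -> 13 (not 14)
      return 0;
    }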
--- paddle/gserver/layers/ExpandConvBaseLayer.cpp | 193 ------------------ paddle/gserver/layers/ExpandConvBaseLayer.h | 28 --- .../gserver/layers/ExpandConvTransLayer.cpp | 90 -------- paddle/gserver/layers/ExpandConvTransLayer.h | 44 ---- paddle/gserver/tests/test_BatchNorm.cpp | 1 - paddle/gserver/tests/test_ConvTrans.cpp | 1 - paddle/gserver/tests/test_ConvUnify.cpp | 1 - 7 files changed, 358 deletions(-) delete mode 100644 paddle/gserver/layers/ExpandConvTransLayer.cpp delete mode 100644 paddle/gserver/layers/ExpandConvTransLayer.h diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp index fdcf994cdb47f..77736e78f9349 100644 --- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp +++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp @@ -22,26 +22,8 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap, /* Initialize the basic convolutional parent class */ ConvBaseLayer::init(layerMap, parameterMap); - /* The class fields channels_ and numFilters_ are the same as in the config - * i.e., channels_ is the for the input and numFilters_ is for the output - * - * But in order for the variables in convTrans having the same semantic - * meaning as in conv, we need to swap channels_ and numFilters here for - * convTrans, and in other functions too. - * */ - - /* Initialize the projection */ for (auto &inputConfig : config_.inputs()) { const ConvConfig &conf = inputConfig.conv_conf(); - int numFilters = isDeconv_ ? conf.channels() : numFilters_; - subM_.push_back(numFilters / conf.groups()); - subN_.push_back(conf.output_x() * - (conf.has_output_y() ? conf.output_y() : conf.output_x())); - int channel = isDeconv_ ? numFilters_ : conf.channels(); - subK_.push_back( - channel * conf.filter_size() * - (conf.has_filter_size_y() ? conf.filter_size_y() : conf.filter_size()) / - conf.groups()); /* Consistent caffe mode for multiple input */ caffeMode_ = conf.caffe_mode(); } @@ -54,17 +36,9 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap, size_t ExpandConvBaseLayer::getOutputSize() { CHECK_NE(inputLayers_.size(), 0UL); size_t layerSize = ConvBaseLayer::calOutputSize(); - subN_.clear(); - for (size_t i = 0; i < inputLayers_.size(); i++) { - subN_.push_back(outputH_[i] * outputW_[i]); - } return layerSize; } -void ExpandConvBaseLayer::resetExpandInput(size_t height, size_t width) { - Matrix::resizeOrCreate(expandInput_, height, width, false, useGpu_); -} - void ExpandConvBaseLayer::addSharedBias() { size_t mapW = getOutputSize() / numFilters_; size_t mapH = getOutputValue()->getElementCnt() / mapW; @@ -101,173 +75,6 @@ void ExpandConvBaseLayer::addUnsharedBias() { outValue->addBias(*bias, 1.0f); } -void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image, - size_t startIdx, - int inIdx) { - int channel = isDeconv_ ? 
numFilters_ : channels_[inIdx]; - - resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]); - - CHECK_EQ(image->getWidth(), - static_cast(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel)); - - real *imgData = image->getData() + startIdx * image->getWidth(); - MatrixPtr imageTmp = - Matrix::create(imgData, - 1, - imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel, - false, - useGpu_); - expandInput_->convExpand(*imageTmp, - imgSizeH_[inIdx], - imgSizeW_[inIdx], - channel, - filterSizeY_[inIdx], - filterSize_[inIdx], - strideY_[inIdx], - stride_[inIdx], - paddingY_[inIdx], - padding_[inIdx], - outputH_[inIdx], - outputW_[inIdx]); - imageTmp->clear(); -} - -void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image, - MatrixPtr out, - int inIdx, - int startIdx) { - int subM = subM_[inIdx]; - int subN = subN_[inIdx]; - int subK = subK_[inIdx]; - - expandOneFrame(image, startIdx, inIdx); - - int numFilters = isDeconv_ ? channels_[inIdx] : numFilters_; - - real *outData = out->getData() + startIdx * subN * numFilters; - - real *wgtData = weights_[inIdx]->getW()->getData(); - real *expInData = expandInput_->getData(); - for (int g = 0; g < groups_[inIdx]; ++g) { - MatrixPtr A = - Matrix::create(wgtData, subM, subK, false, useGpu_); // mark transpose - MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_); - MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_); - C->mul(*A, *B, 1, 1); - - A->clear(); - B->clear(); - C->clear(); - wgtData += subK * subM; - expInData += subK * subN; - outData += subM * subN; - } -} - -void ExpandConvBaseLayer::bpropActs(MatrixPtr out, - MatrixPtr image, - int inpIdx) { - int channel = isDeconv_ ? numFilters_ : channels_[inpIdx]; - - int subM = subM_[inpIdx]; - int subN = subN_[inpIdx]; - int subK = subK_[inpIdx]; - size_t batchSize = image->getHeight(); - - /* reset the expand-grad memory */ - resetExpandInput(subK * groups_[inpIdx], subN); - - real *localGradData = out->getData(); - real *tgtGradData = image->getData(); - for (size_t n = 0; n < batchSize; n++) { - real *wgtData = weights_[inpIdx]->getW()->getData(); - real *expandInData = expandInput_->getData(); - - for (int g = 0; g < groups_[inpIdx]; g++) { - // create temporary matrix - MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_); - MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_); - MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_); - C->mul(*A, *B); // mul - - // clear the temporary matrix - A->clear(); - B->clear(); - C->clear(); - - expandInData += subK * subN; - localGradData += subM * subN; - wgtData += subK * subM; - } - - // shrink one frame outGrad - MatrixPtr oneGradTmp = Matrix::create( - expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_); - MatrixPtr vTmp = - Matrix::create(tgtGradData, - 1, - imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel, - false, - useGpu_); - vTmp->convShrink(*oneGradTmp, - imgSizeH_[inpIdx], - imgSizeW_[inpIdx], - channel, - filterSizeY_[inpIdx], - filterSize_[inpIdx], - strideY_[inpIdx], - stride_[inpIdx], - paddingY_[inpIdx], - padding_[inpIdx], - outputH_[inpIdx], - outputW_[inpIdx], - 1.0f, - 1.0f); - vTmp->clear(); - oneGradTmp->clear(); - - // move the data-pointer - tgtGradData += imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel; - } -} - -void ExpandConvBaseLayer::bpropWeights(MatrixPtr image, - MatrixPtr out, - int inpIdx) { - MatrixPtr weightGrad = weights_[inpIdx]->getWGrad(); - - int subM = subM_[inpIdx]; - int subN = subN_[inpIdx]; - int subK = 
subK_[inpIdx]; - size_t batchSize = image->getHeight(); - resetExpandInput(subK * groups_[inpIdx], subN); - - real *gradData = out->getData(); - - for (size_t n = 0; n < batchSize; n++) { // frame by frame - // expand - expandOneFrame(image, n, inpIdx); - real *wGradData = weightGrad->getData(); - real *expandInData = expandInput_->getData(); - - // expand-mul one-group by one - for (int g = 0; g < groups_[inpIdx]; g++) { - MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_); - MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_); - MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_); - C->mul(*B, *A, 1, 1); - - A->clear(); - B->clear(); - C->clear(); - gradData += subM * subN; - wGradData += subK * subM; - expandInData += subK * subN; - } - } -} - void ExpandConvBaseLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) { size_t mapW = getOutputSize() / numFilters_; size_t mapH = v->getElementCnt() / mapW; diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.h b/paddle/gserver/layers/ExpandConvBaseLayer.h index aabcdfc392d3e..01c699d234444 100644 --- a/paddle/gserver/layers/ExpandConvBaseLayer.h +++ b/paddle/gserver/layers/ExpandConvBaseLayer.h @@ -26,19 +26,6 @@ namespace paddle { */ class ExpandConvBaseLayer : public ConvBaseLayer { protected: - /// For expand convolution. - /// subM_ = numFilters_ / groups_. - IntV subM_; - /// subN_ = outputH_ * outputW_. - IntV subN_; - /// subK_ = channels_ * filterPixels_ * groups_. - IntV subK_; - - /*The expandInput_ and transOutValue_ are used for CPU expand conv calc - * Expand one sample at a time. shape: - * (numChannels * filterPixels_, outputSizeH * outputSizeW) - * */ - MatrixPtr expandInput_; /// The transpose of output, which is an auxiliary matrix. MatrixPtr transOutValue_; @@ -52,10 +39,6 @@ class ExpandConvBaseLayer : public ConvBaseLayer { const ParameterMap& parameterMap) override; size_t getOutputSize(); - /** - * Create or resize expandInput_. - */ - void resetExpandInput(size_t height, size_t width); /** * Add shared bias. @@ -66,20 +49,9 @@ class ExpandConvBaseLayer : public ConvBaseLayer { * Add unshared bias. */ void addUnsharedBias(); - /** - * Expand one input sample. - */ - void expandOneFrame(MatrixPtr image, size_t startIdx, int inIdx); - - /** - * Expand one input sample and perform matrix multiplication. - */ - void expandFwdOnce(MatrixPtr image, MatrixPtr out, int inIdx, int startIdx); void bpropSharedBias(MatrixPtr biases, MatrixPtr v); void bpropBiases(MatrixPtr v); - void bpropWeights(MatrixPtr image, MatrixPtr out, int inpIdx); - void bpropActs(MatrixPtr image, MatrixPtr out, int inpIdx); }; } // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvTransLayer.cpp b/paddle/gserver/layers/ExpandConvTransLayer.cpp deleted file mode 100644 index b80a01e328784..0000000000000 --- a/paddle/gserver/layers/ExpandConvTransLayer.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ExpandConvTransLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -/* The implementation of the convTransLayer is basically a swap of forward and - * backward of the original convLayer. - * The variable naming follows the convention of the convLayer. - * */ - -namespace paddle { - -// REGISTER_LAYER(exconvt, ExpandConvTransLayer); - -bool ExpandConvTransLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { - /* Initialize the basic convolutional parent class */ - ExpandConvBaseLayer::init(layerMap, parameterMap); - - return true; -} - -void ExpandConvTransLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - resetOutput(batchSize, getOutputSize()); - - MatrixPtr output = nullptr; - for (size_t i = 0; i < inputLayers_.size(); ++i) { - LayerPtr prevLayer = getPrev(i); - output = prevLayer->getOutputValue(); - REGISTER_TIMER_INFO("shrinkFwd", getName().c_str()); - bpropActs(output, getOutputValue(), i); - } - - /* add the bias-vector */ - if (biases_.get()) { - if (sharedBiases_) { - addSharedBias(); - } else { - addUnsharedBias(); - } - } - - /* activation */ - forwardActivation(); -} - -void ExpandConvTransLayer::backward(const UpdateCallback &callback) { - backwardActivation(); - - MatrixPtr imageGrad = getOutputGrad(); - if (biases_ && biases_->getWGrad()) { - bpropBiases(imageGrad); - /* Increasing the number of gradient */ - biases_->getParameterPtr()->incUpdate(callback); - } - - for (size_t i = 0; i < inputLayers_.size(); ++i) { - /* First, calculate the input layers error */ - for (size_t off = 0; off < imageGrad->getHeight(); off++) { - if (getPrev(i)->getOutputGrad()) { - expandFwdOnce(imageGrad, getPrev(i)->getOutputGrad(), i, off); - } - } - if (weights_[i]->getWGrad()) { - /* Then, calculate the W-gradient for the current layer */ - bpropWeights(imageGrad, getPrev(i)->getOutputValue(), i); - /* Increasing the number of gradient */ - weights_[i]->getParameterPtr()->incUpdate(callback); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvTransLayer.h b/paddle/gserver/layers/ExpandConvTransLayer.h deleted file mode 100644 index 00b8f241889fd..0000000000000 --- a/paddle/gserver/layers/ExpandConvTransLayer.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "ExpandConvBaseLayer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief A subclass of convolution layer. - * This layer expands input and use matrix multiplication to - * calculate convolution transpose (deconv) operation. - * - * The config file api is img_conv_layer with flag trans=True. 
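As the deleted comment notes, the transposed-convolution layer is essentially the original convolution layer with forward and backward swapped: its forward pass calls the conv backward-data path (bpropActs), and its backward pass reuses expandFwdOnce and bpropWeights. A minimal 1-D sketch of why this holds, with no padding and illustrative sizes:

```cpp
#include <cstddef>
#include <vector>

// 1-D illustration (no padding): the forward pass of a transposed convolution
// scatters every input element through the filter, which is exactly the
// data-backward pass of an ordinary strided convolution.
std::vector<float> convTransposeForward1D(const std::vector<float>& input,
                                          const std::vector<float>& filter,
                                          size_t stride) {
  size_t outSize = (input.size() - 1) * stride + filter.size();
  std::vector<float> output(outSize, 0.0f);
  for (size_t i = 0; i < input.size(); ++i)
    for (size_t f = 0; f < filter.size(); ++f)
      output[i * stride + f] += input[i] * filter[f];  // scatter-add
  return output;
}
```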
- */ -class ExpandConvTransLayer : public ExpandConvBaseLayer { -public: - explicit ExpandConvTransLayer(const LayerConfig& config) - : ExpandConvBaseLayer(config) {} - - ~ExpandConvTransLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp index d07299bfe3c41..83fcfed46cd56 100644 --- a/paddle/gserver/tests/test_BatchNorm.cpp +++ b/paddle/gserver/tests/test_BatchNorm.cpp @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "ModelConfig.pb.h" #include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/ExpandConvTransLayer.h" #include "paddle/trainer/Trainer.h" #include "paddle/utils/GlobalConstants.h" diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp index 40bb1e2d73c81..6035a866b4eee 100644 --- a/paddle/gserver/tests/test_ConvTrans.cpp +++ b/paddle/gserver/tests/test_ConvTrans.cpp @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "ModelConfig.pb.h" #include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/ExpandConvTransLayer.h" #include "paddle/math/MathUtils.h" #include "paddle/trainer/Trainer.h" #include "paddle/utils/GlobalConstants.h" diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp index 54b72375b743f..e7325e0cc3b71 100644 --- a/paddle/gserver/tests/test_ConvUnify.cpp +++ b/paddle/gserver/tests/test_ConvUnify.cpp @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "ModelConfig.pb.h" #include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/ExpandConvTransLayer.h" #include "paddle/math/MathUtils.h" #include "paddle/trainer/Trainer.h" #include "paddle/utils/GlobalConstants.h" From 01d52ebf167029a165809736d78879802168152b Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 11:05:06 +0800 Subject: [PATCH 20/24] Fix RowConvOpTest use CpuGpuFuncCompare. --- paddle/function/RowConvOpTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/function/RowConvOpTest.cpp b/paddle/function/RowConvOpTest.cpp index 1c95d3ff2cccb..f52d18b0491ec 100644 --- a/paddle/function/RowConvOpTest.cpp +++ b/paddle/function/RowConvOpTest.cpp @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) { - FunctionCompare test("RowConv", FuncConfig()); + CpuGpuFuncCompare test("RowConv", FuncConfig()); test.addSequence(SequenceIdArg(TensorShape{batchSize})); test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim})); @@ -31,7 +31,7 @@ void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) { } void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) { - FunctionCompare test("RowConvGrad", FuncConfig()); + CpuGpuFuncCompare test("RowConvGrad", FuncConfig()); test.addSequence(SequenceIdArg(TensorShape{batchSize})); test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim})); From 2608c4854273e47bd0958fbb03ca67050ddfb35c Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 13 Jun 2017 11:30:36 +0800 Subject: [PATCH 21/24] Add test cases where the height and width (input, filter) are not equal. 
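The CpuGpuFuncCompare harness used in the updated RowConv tests runs the same Function on CPU and GPU inputs and checks that the outputs agree. Conceptually the final check reduces to an element-wise comparison within a tolerance, sketched below; the helper and the tolerance values are assumptions for illustration, not the FunctionTest implementation.

```cpp
#include <cmath>
#include <cstddef>

// Conceptual sketch only, not the FunctionTest harness: after running the same
// Function on CPU and GPU buffers, the comparison reduces to an element-wise
// check within a tolerance. Tolerance values here are illustrative assumptions.
bool outputsMatch(const float* out1, const float* out2, size_t n,
                  float relTol = 1e-3f, float absTol = 1e-5f) {
  for (size_t i = 0; i < n; ++i) {
    float diff = std::fabs(out1[i] - out2[i]);
    float scale = std::fabs(out1[i]) + std::fabs(out2[i]);
    if (diff > absTol && diff > relTol * scale) return false;
  }
  return true;
}
```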
--- paddle/function/ConvOpTest.cpp | 87 +++++++++++++++++++++++++++++++++ paddle/function/NaiveConvOp.cpp | 2 +- 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp index 59c7238d218a5..dfa2f784610b0 100644 --- a/paddle/function/ConvOpTest.cpp +++ b/paddle/function/ConvOpTest.cpp @@ -98,25 +98,112 @@ class ConvolutionTest { } }; +// Mainly used to test cases where the height and width (input, filter) +// are not equal. +template +class ConvolutionTest2 { +public: + ConvolutionTest2(const std::string& conv1, + const std::string& conv2, + TestType type, + std::string algo = "auto") { + for (size_t batchSize : {16}) { + for (size_t inputHeight : {7, 31}) { + for (size_t inputWidth : {10, 54}) { + for (size_t filterHeight : {1, 5}) { + for (size_t filterWidth : {3, 7}) { + for (size_t inputChannels : {7}) { + for (size_t outputChannels : {32}) { + size_t stride = 1; + size_t padding = 0; + size_t outputHeight = + (inputHeight - filterHeight + 2 * padding + stride) / + stride; + size_t outputWidth = + (inputWidth - filterWidth + 2 * padding + stride) / + stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputHeight + << " inputWidth=" << inputWidth + << " outputChannels=" << outputChannels + << " filterHeight=" << filterHeight + << " filterWidth=" << filterWidth + << " outputHeight=" << outputHeight + << " outputWidth=" << outputWidth + << " stride=" << stride << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)1) + .set("algo", algo)); + + TensorShape input{ + batchSize, inputChannels, inputHeight, inputWidth}; + TensorShape filter{ + outputChannels, inputChannels, filterHeight, filterWidth}; + TensorShape output{ + batchSize, outputChannels, outputHeight, outputWidth}; + + if (type == kForwardTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); + } else if (type == kBackwardInputTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); + test.run(); + } else if (type == kBackwardFilterTest) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.run(); + } + } + } + } + } + } + } + } + } +}; + TEST(Forward, GEMM) { ConvolutionTest test( "NaiveConv-CPU", "GemmConv-CPU", kForwardTest); + ConvolutionTest2 test2( + "NaiveConv-CPU", "GemmConv-CPU", kForwardTest); } #ifndef PADDLE_ONLY_CPU TEST(Forward, GEMM2) { ConvolutionTest test( "GemmConv-CPU", "GemmConv-GPU", kForwardTest); + ConvolutionTest2 test2( + "GemmConv-CPU", "GemmConv-GPU", kForwardTest); } TEST(BackwardInput, GEMM) { ConvolutionTest test( "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest); + ConvolutionTest2 test2( + "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest); } TEST(BackwardFilter, GEMM) { ConvolutionTest test( "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); + ConvolutionTest2 test2( + "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); } 
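The ConvolutionTest2 grid above derives the output height and width as (inputSize - filterSize + 2 * padding + stride) / stride, which is the usual (inputSize - filterSize + 2 * padding) / stride + 1. A small self-contained check with two combinations from that grid:

```cpp
#include <cassert>
#include <cstddef>

// Output size as computed in the test grid above:
// outputSize = (inputSize - filterSize + 2 * padding + stride) / stride
//            = (inputSize - filterSize + 2 * padding) / stride + 1
size_t convOutputSize(size_t inputSize, size_t filterSize, size_t padding,
                      size_t stride) {
  return (inputSize - filterSize + 2 * padding + stride) / stride;
}

int main() {
  assert(convOutputSize(7, 5, 0, 1) == 3);    // inputHeight=7,  filterHeight=5
  assert(convOutputSize(54, 7, 0, 1) == 48);  // inputWidth=54, filterWidth=7
  return 0;
}
```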
#endif diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp index 94aba253e3e56..1d204f99e0e12 100644 --- a/paddle/function/NaiveConvOp.cpp +++ b/paddle/function/NaiveConvOp.cpp @@ -104,7 +104,7 @@ class NaiveConvFunction : public ConvFunctionBase { size_t inputHeight = inputs[0].shape()[2]; size_t inputWidth = inputs[0].shape()[3]; size_t filterHeight = inputs[1].shape()[2]; - size_t filterWidth = inputs[1].shape()[2]; + size_t filterWidth = inputs[1].shape()[3]; size_t outputChannels = outputs[0].shape()[1]; size_t outputHeight = outputs[0].shape()[2]; size_t outputWidth = outputs[0].shape()[3]; From c6e010d0ddaab9d00705c1ce890ede9956559c9f Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 16 Jun 2017 23:06:43 +0800 Subject: [PATCH 22/24] Follow comments. --- paddle/function/ConvOp.h | 12 ++---------- paddle/function/FunctionTest.h | 4 ++-- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index 2cfc0712557e5..92733bc1b7615 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -104,19 +104,11 @@ class ConvFunctionBase : public FunctionBase { protected: size_t getFilterHeight(const TensorShape& filter) const { - if (filter.ndims() == 5) { - return filter[3]; - } else { - return filter[2]; - } + filter[filter.ndims() - 2]; } size_t getFilterWidth(const TensorShape& filter) const { - if (filter.ndims() == 5) { - return filter[4]; - } else { - return filter[3]; - } + filter[filter.ndims() - 1]; } std::vector strides_; diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h index 69ab33052dadc..ba446bf92da26 100644 --- a/paddle/function/FunctionTest.h +++ b/paddle/function/FunctionTest.h @@ -296,9 +296,9 @@ class Compare2Function { compareOutputs(); } - std::shared_ptr getCpuFunction() const { return function1_; } + std::shared_ptr getFunction1() const { return function1_; } - std::shared_ptr getGpuFunction() const { return function2_; } + std::shared_ptr getFunction2() const { return function2_; } protected: // only init cpu argument, gpu argument copy from cpu argument. From 1ed31b4e4e45db3c9c0925cdb5169c5e24bde0af Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 16 Jun 2017 23:17:32 +0800 Subject: [PATCH 23/24] Bug fix. --- paddle/function/ConvOp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index 92733bc1b7615..b87808915ef2c 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -104,11 +104,11 @@ class ConvFunctionBase : public FunctionBase { protected: size_t getFilterHeight(const TensorShape& filter) const { - filter[filter.ndims() - 2]; + return filter[filter.ndims() - 2]; } size_t getFilterWidth(const TensorShape& filter) const { - filter[filter.ndims() - 1]; + return filter[filter.ndims() - 1]; } std::vector strides_; From 9c47c42ad1953e2fbe89ff234c3d35494b5b751a Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Mon, 19 Jun 2017 15:03:01 +0800 Subject: [PATCH 24/24] Change the groups in the comment to 1049089. --- paddle/function/ConvOp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index b87808915ef2c..65b9d1d53f921 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -50,9 +50,9 @@ namespace paddle { * image channels, C is the number of input image channels, * H and W is height and width of filter. 
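The ConvOp.h change above collapses the 4-D/5-D branch into filter[ndims - 2] and filter[ndims - 1], which covers both the MCHW and the grouped GMCHW filter layouts (the missing return statements are then added in the follow-up bug-fix commit). A minimal sketch of the equivalence, using a std::vector<size_t> as a stand-in for TensorShape:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Stand-in for TensorShape: the last two dimensions are always the filter
// height and width, whether the shape is MCHW (4-D) or grouped GMCHW (5-D).
size_t filterHeight(const std::vector<size_t>& filter) {
  return filter[filter.size() - 2];
}
size_t filterWidth(const std::vector<size_t>& filter) {
  return filter[filter.size() - 1];
}

int main() {
  std::vector<size_t> mchw = {32, 7, 5, 3};      // M, C, H, W
  std::vector<size_t> gmchw = {2, 16, 7, 5, 3};  // G, M, C, H, W
  assert(filterHeight(mchw) == 5 && filterWidth(mchw) == 3);
  assert(filterHeight(gmchw) == 5 && filterWidth(gmchw) == 3);
  return 0;
}
```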
* - * If groups is greater than 1, the filter's data format should be GMCHW, - * where G is the groups, and G * M is the number of output image channels, - * G * C is the number of input image channels, + * If `groups` is greater than 1, the filter's data format should be GMCHW, + * where G is the `groups`, and G * M is the number of output image + * channels, G * C is the number of input image channels, * H and W is height and width of filter. */ class ConvFunctionBase : public FunctionBase {
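For the GMCHW layout described in the revised comment, a hedged sketch of how a filter element (g, m, c, h, w) maps to a flat row-major offset, and how each group connects its slice of input channels to its slice of output channels; the struct and helper names are illustrative only, not part of the Function API.

```cpp
#include <cstddef>

// Illustrative sketch of the GMCHW layout described above. With `groups` = G,
// the filter covers G * M output channels and G * C input channels; group g
// connects input channels [g*C, (g+1)*C) to output channels [g*M, (g+1)*M).
struct GroupedFilterShape {
  size_t G, M, C, H, W;  // groups, filters per group, channels per group, height, width
};

// Flat offset of element (g, m, c, h, w) in row-major GMCHW storage.
size_t filterOffset(const GroupedFilterShape& s, size_t g, size_t m, size_t c,
                    size_t h, size_t w) {
  return (((g * s.M + m) * s.C + c) * s.H + h) * s.W + w;
}
```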