From 0ab22f888a804183505581c5dbbd9458cc6146e6 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 6 Jul 2017 16:55:06 +0800 Subject: [PATCH] add Region Proposal Network for Faster R-CNN, including AnchorLayer, RPNLossLayer and ProposalLayer --- paddle/gserver/layers/AnchorLayer.cpp | 157 +++++ paddle/gserver/layers/AnchorLayer.h | 62 ++ paddle/gserver/layers/ProposalLayer.cpp | 353 +++++++++++ paddle/gserver/layers/ProposalLayer.h | 138 ++++ paddle/gserver/layers/RPNLossLayer.cpp | 589 ++++++++++++++++++ paddle/gserver/layers/RPNLossLayer.h | 142 +++++ paddle/gserver/tests/CMakeLists.txt | 7 + paddle/gserver/tests/test_LayerGrad.cpp | 69 ++ paddle/gserver/tests/test_Proposal.cpp | 163 +++++ proto/ModelConfig.proto | 32 + python/paddle/trainer/config_parser.py | 57 ++ .../paddle/trainer_config_helpers/layers.py | 153 +++++ 12 files changed, 1922 insertions(+) create mode 100644 paddle/gserver/layers/AnchorLayer.cpp create mode 100644 paddle/gserver/layers/AnchorLayer.h create mode 100644 paddle/gserver/layers/ProposalLayer.cpp create mode 100644 paddle/gserver/layers/ProposalLayer.h create mode 100644 paddle/gserver/layers/RPNLossLayer.cpp create mode 100644 paddle/gserver/layers/RPNLossLayer.h create mode 100644 paddle/gserver/tests/test_Proposal.cpp diff --git a/paddle/gserver/layers/AnchorLayer.cpp b/paddle/gserver/layers/AnchorLayer.cpp new file mode 100644 index 0000000000000..78b9245a5ffc4 --- /dev/null +++ b/paddle/gserver/layers/AnchorLayer.cpp @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "AnchorLayer.h"
+#include <cmath>
+#include <vector>
+
+namespace paddle {
+
+REGISTER_LAYER(anchor, AnchorLayer);
+
+namespace {
+// Number of values generateAllAnchors() stores per anchor:
+// xmin, ymin, xmax, ymax, inside-image flag, image width, image height.
+const size_t kAnchorEntrySize = 7;
+}  // namespace
+
+/**
+ * Loads the anchor scales, aspect ratios, base size and feature strides from
+ * the AnchorConfig attached to input 0.
+ *
+ * @return always true.
+ */
+bool AnchorLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  const AnchorConfig& anchorConf = config_.inputs(0).anchor_conf();
+  // assign() rather than appending, so calling init() again cannot duplicate
+  // the configured scales/ratios.
+  anchorScales_.assign(anchorConf.scale_ratio().begin(),
+                       anchorConf.scale_ratio().end());
+  anchorRatios_.assign(anchorConf.aspect_ratio().begin(),
+                       anchorConf.aspect_ratio().end());
+  baseSize_ = anchorConf.base_size();
+  featStrideX_ = anchorConf.feat_stride_x();
+  featStrideY_ = anchorConf.feat_stride_y();
+  allowedBorder_ = 0;
+  return true;
+}
+
+/**
+ * Rebuilds anchors_: one box per (aspect ratio, scale) pair, derived from the
+ * reference box (0, 0, baseSize_ - 1, baseSize_ - 1).
+ */
+void AnchorLayer::generateBaseAnchors() {
+  // forward() calls this on every pass; without the clear() the base anchors
+  // would be appended again each time and overflow the reserved output.
+  anchors_.clear();
+
+  const std::vector<real> baseAnchor{0,
+                                     0,
+                                     static_cast<real>(baseSize_ - 1),
+                                     static_cast<real>(baseSize_ - 1)};
+  const std::vector<std::vector<real>> ratioAnchors = enumRatio(baseAnchor);
+  for (const auto& ratioAnchor : ratioAnchors) {
+    const std::vector<std::vector<real>> scaled = enumScale(ratioAnchor);
+    anchors_.insert(anchors_.end(), scaled.begin(), scaled.end());
+  }
+}
+
+/**
+ * Returns one box per configured aspect ratio. Each box keeps the centre of
+ * `anchor`; its width is round(sqrt(area / ratio)) and its height is
+ * round(width * ratio).
+ */
+std::vector<std::vector<real>> AnchorLayer::enumRatio(
+    const std::vector<real>& anchor) {
+  const std::vector<real> whctr = anchor2whctr(anchor);
+  const real area = whctr[0] * whctr[1];
+  const real ctrX = whctr[2];
+  const real ctrY = whctr[3];
+
+  std::vector<std::vector<real>> ratioAnchors;
+  ratioAnchors.reserve(anchorRatios_.size());
+  for (size_t i = 0; i < anchorRatios_.size(); ++i) {
+    const real ratioSize = area / anchorRatios_[i];
+    const real ratioW = std::round(std::sqrt(ratioSize));
+    const real ratioH = std::round(ratioW * anchorRatios_[i]);
+    ratioAnchors.push_back(whctr2anchor(ratioW, ratioH, ctrX, ctrY));
+  }
+  return ratioAnchors;
+}
+
+/**
+ * Returns one box per configured scale: same centre as `anchor`, width and
+ * height multiplied by the scale.
+ */
+std::vector<std::vector<real>> AnchorLayer::enumScale(
+    const std::vector<real>& anchor) {
+  const std::vector<real> whctr = anchor2whctr(anchor);
+  const real w = whctr[0];
+  const real h = whctr[1];
+  const real ctrX = whctr[2];
+  const real ctrY = whctr[3];
+
+  std::vector<std::vector<real>> scaleAnchors;
+  scaleAnchors.reserve(anchorScales_.size());
+  for (size_t i = 0; i < anchorScales_.size(); ++i) {
+    scaleAnchors.push_back(
+        whctr2anchor(w * anchorScales_[i], h * anchorScales_[i], ctrX, ctrY));
+  }
+  return scaleAnchors;
+}
+
+/**
+ * Converts (xmin, ymin, xmax, ymax) into (width, height, ctrX, ctrY).
+ * Width and height are pixel-inclusive, hence the +1.
+ */
+std::vector<real> AnchorLayer::anchor2whctr(const std::vector<real>& anchor) {
+  std::vector<real> whctr;
+  whctr.reserve(4);
+  whctr.push_back(anchor[2] - anchor[0] + 1);    // w
+  whctr.push_back(anchor[3] - anchor[1] + 1);    // h
+  whctr.push_back((anchor[2] + anchor[0]) / 2);  // ctrX
+  whctr.push_back((anchor[3] + anchor[1]) / 2);  // ctrY
+  return whctr;
+}
+
+/**
+ * Converts (width, height, ctrX, ctrY) back into (xmin, ymin, xmax, ymax);
+ * the inverse of anchor2whctr().
+ */
+std::vector<real> AnchorLayer::whctr2anchor(real w,
+                                            real h,
+                                            real ctrX,
+                                            real ctrY) {
+  std::vector<real> anchor;
+  anchor.reserve(4);
+  anchor.push_back(ctrX - 0.5 * (w - 1));
+  anchor.push_back(ctrY - 0.5 * (h - 1));
+  anchor.push_back(ctrX + 0.5 * (w - 1));
+  anchor.push_back(ctrY + 0.5 * (h - 1));
+  return anchor;
+}
+
+/**
+ * Writes every base anchor, shifted to every feature-map cell, into the
+ * output matrix. Cells are visited row by row (h outer, w inner) and each
+ * anchor occupies kAnchorEntrySize consecutive values:
+ * xmin, ymin, xmax, ymax, flag (1 = inside the image, -1 = crosses the
+ * border), image width, image height.
+ *
+ * A stride configured as 0 is derived from the image/feature-map size ratio
+ * the first time it is needed and then kept in the member.
+ */
+void AnchorLayer::generateAllAnchors(size_t layerHeight,
+                                     size_t layerWidth,
+                                     size_t imageHeight,
+                                     size_t imageWidth) {
+  auto* out = getOutputValue()->getData();
+  if (featStrideX_ == 0)
+    featStrideX_ = static_cast<real>(imageWidth) / layerWidth;
+  if (featStrideY_ == 0)
+    featStrideY_ = static_cast<real>(imageHeight) / layerHeight;
+
+  size_t idx = 0;
+  for (size_t h = 0; h < layerHeight; ++h) {
+    // Rows move the box vertically, columns horizontally. The shifts used to
+    // be swapped (x shifted by h, y shifted by w), which transposed the grid.
+    const real shiftY = h * featStrideY_;
+    for (size_t w = 0; w < layerWidth; ++w) {
+      const real shiftX = w * featStrideX_;
+      for (const auto& anchor : anchors_) {
+        const real xMin = anchor[0] + shiftX;
+        const real yMin = anchor[1] + shiftY;
+        const real xMax = anchor[2] + shiftX;
+        const real yMax = anchor[3] + shiftY;
+        out[idx++] = xMin;
+        out[idx++] = yMin;
+        out[idx++] = xMax;
+        out[idx++] = yMax;
+        // keep only inside anchors
+        const bool inside = xMin + allowedBorder_ >= 0 &&
+                            yMin + allowedBorder_ >= 0 &&
+                            xMax < imageWidth + allowedBorder_ &&
+                            yMax < imageHeight + allowedBorder_;
+        out[idx++] = inside ? 1 : -1;
+        // to be used in proposal generation for box clipping
+        out[idx++] = imageWidth;
+        out[idx++] = imageHeight;
+      }
+    }
+  }
+}
+
+/**
+ * Input 0 supplies the feature-map frame size, input 1 the image frame size.
+ * The output is a single row holding kAnchorEntrySize values for every
+ * (cell, scale, ratio) combination.
+ */
+void AnchorLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  auto featMap = getInput(0);
+  const size_t layerWidth = featMap.getFrameWidth();
+  const size_t layerHeight = featMap.getFrameHeight();
+
+  auto image = getInput(1);
+  const size_t imageWidth = image.getFrameWidth();
+  const size_t imageHeight = image.getFrameHeight();
+
+  // Must match what generateAllAnchors() writes (7 values per anchor); the
+  // previous factor of 5 under-allocated the output buffer.
+  const size_t dim = layerHeight * layerWidth * anchorScales_.size() *
+                     anchorRatios_.size() * kAnchorEntrySize;
+  reserveOutput(1, dim);
+
+  generateBaseAnchors();
+  generateAllAnchors(layerHeight, layerWidth, imageHeight, imageWidth);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/AnchorLayer.h b/paddle/gserver/layers/AnchorLayer.h
new file mode 100644
index 0000000000000..86b786793fc99
--- /dev/null
+++ b/paddle/gserver/layers/AnchorLayer.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+
+namespace paddle {
+/**
+ * @brief A layer used by Faster R-CNN to generate anchor-box locations.
+ * - Input: Exactly two input layers. Input 0 is the convolution output whose
+ *          frame height/width define the anchor grid; input 1 is the data
+ *          (image) layer whose frame height/width define the image size.
+ * - Output: The anchor-box locations of the input data. Each anchor is
+ *           stored as xmin, ymin, xmax, ymax, inside-image flag, image width,
+ *           image height.
+ * Reference:
+ *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ *    Networks
+ */
+
+class AnchorLayer : public Layer {
+public:
+  explicit AnchorLayer(const LayerConfig& config) : Layer(config) {}
+  ~AnchorLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  // Anchors are not learned, so there is nothing to back-propagate.
+  void backward(const UpdateCallback& callback) override {}
+
+protected:
+  // All scalars are overwritten by init(); the zero defaults only make an
+  // un-initialised layer deterministic.
+  size_t baseSize_{0};       // side length of the reference box
+  size_t featStrideX_{0};    // horizontal stride; 0 = derive in forward pass
+  size_t featStrideY_{0};    // vertical stride; 0 = derive in forward pass
+  size_t allowedBorder_{0};  // tolerance used by the inside-image test
+  std::vector<real> anchorScales_;
+  std::vector<real> anchorRatios_;
+  // Base anchors, each stored as (xmin, ymin, xmax, ymax).
+  std::vector<std::vector<real>> anchors_;
+
+  // Fills anchors_ from baseSize_, anchorRatios_ and anchorScales_.
+  void generateBaseAnchors();
+  // One box per aspect ratio, centred like `anchor`.
+  std::vector<std::vector<real>> enumRatio(const std::vector<real>& anchor);
+  // One box per scale, centred like `anchor`.
+  std::vector<std::vector<real>> enumScale(const std::vector<real>& anchor);
+  // (xmin, ymin, xmax, ymax) -> (w, h, ctrX, ctrY).
+  std::vector<real> anchor2whctr(const std::vector<real>& anchor);
+  // (w, h, ctrX, ctrY) -> (xmin, ymin, xmax, ymax).
+  std::vector<real> whctr2anchor(real w, real h, real ctrX, real ctrY);
+  // Shifts the base anchors to every feature-map cell and writes them to the
+  // output matrix.
+  void generateAllAnchors(size_t layerHeight,
+                          size_t layerWidth,
+                          size_t imageHeight,
+                          size_t imageWidth);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ProposalLayer.cpp b/paddle/gserver/layers/ProposalLayer.cpp
new file mode 100644
index 0000000000000..555343e330dbb
--- /dev/null
+++ b/paddle/gserver/layers/ProposalLayer.cpp
@@ -0,0 +1,353 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ProposalLayer.h"
+#include <algorithm>
+#include <cmath>
+
+namespace paddle {
+
+REGISTER_LAYER(proposal, ProposalLayer);
+
+namespace {
+// Values per anchor in the prior-box input:
+// xmin, ymin, xmax, ymax, inside-image flag, image width, image height.
+const size_t kPriorEntrySize = 7;
+// Values per predicted box offset: dx, dy, dw, dh.
+const size_t kLocEntrySize = 4;
+// Values per output row:
+// image index, label, score, xmin, ymin, xmax, ymax.
+const size_t kOutputRowSize = 7;
+
+// Clamps v into [lo, hi].
+inline real clipTo(real v, real lo, real hi) {
+  return std::min(std::max(v, lo), hi);
+}
+}  // namespace
+
+/**
+ * Reads the NMS / filtering parameters from the ProposalConfig attached to
+ * input 0. RPN is a two-class (background / foreground) problem with a single
+ * loc/conf input pair, so those values are fixed here.
+ *
+ * @return always true.
+ */
+bool ProposalLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  auto& layerConf = config_.inputs(0).proposal_conf();
+  nmsThreshold_ = layerConf.nms_threshold();
+  confidenceThreshold_ = layerConf.confidence_threshold();
+  nmsTopK_ = layerConf.nms_top_k();
+  keepTopK_ = layerConf.keep_top_k();
+  minWidth_ = layerConf.min_width();
+  minHeight_ = layerConf.min_height();
+  numClasses_ = 2;
+  inputNum_ = 1;
+  backgroundId_ = 0;
+  return true;
+}
+
+/**
+ * Intersection-over-union of two boxes in pixel-inclusive coordinates.
+ *
+ * @return 0 when the boxes do not overlap, otherwise
+ *         intersection / (area1 + area2 - intersection).
+ */
+real ProposalLayer::jaccardOverlap(const UnnormalizedBBox& bbox1,
+                                   const UnnormalizedBBox& bbox2) {
+  if (bbox2.xMin > bbox1.xMax || bbox2.xMax < bbox1.xMin ||
+      bbox2.yMin > bbox1.yMax || bbox2.yMax < bbox1.yMin) {
+    return 0.0;
+  }
+
+  const real interXMin = std::max(bbox1.xMin, bbox2.xMin);
+  const real interYMin = std::max(bbox1.yMin, bbox2.yMin);
+  const real interXMax = std::min(bbox1.xMax, bbox2.xMax);
+  const real interYMax = std::min(bbox1.yMax, bbox2.yMax);
+
+  const real interWidth = interXMax - interXMin + 1;
+  const real interHeight = interYMax - interYMin + 1;
+  const real interArea = interWidth * interHeight;
+
+  // Areas come from UnnormalizedBBox's own pixel-inclusive getWidth() /
+  // getHeight(), so they use the same +1 convention as the intersection.
+  // NOTE(review): getArea() is inherited from BBoxBase, which is not visible
+  // here; presumably it does not apply the +1 - confirm.
+  const real bboxArea1 = bbox1.getWidth() * bbox1.getHeight();
+  const real bboxArea2 = bbox2.getWidth() * bbox2.getHeight();
+
+  return interArea / (bboxArea1 + bboxArea2 - interArea);
+}
+
+/**
+ * Greedy non-maximum suppression for one class of one image.
+ *
+ * Boxes narrower than minWidth or shorter than minHeight, and boxes whose
+ * score is not above confThreshold, are dropped. The rest are ordered by
+ * descending score, truncated to topK (when topK > 0) and appended to
+ * `indices` unless they overlap an already kept box by more than
+ * nmsThreshold.
+ *
+ * @param confScoreData scores of one image, laid out [prior][class].
+ * @param indices       output; kept prior indices are appended.
+ */
+void ProposalLayer::applyNMSFast(const vector<UnnormalizedBBox>& bboxes,
+                                 const real* confScoreData,
+                                 size_t classIdx,
+                                 size_t topK,
+                                 real confThreshold,
+                                 real nmsThreshold,
+                                 real minWidth,
+                                 real minHeight,
+                                 size_t numPriorBBoxes,
+                                 size_t numClasses,
+                                 vector<size_t>* indices) {
+  vector<pair<real, size_t>> scores;
+  for (size_t i = 0; i < numPriorBBoxes; ++i) {
+    // remove predicted boxes with either height or width < threshold
+    if (bboxes[i].getWidth() < minWidth || bboxes[i].getHeight() < minHeight) {
+      continue;
+    }
+    const size_t confOffset = i * numClasses + classIdx;
+    if (confScoreData[confOffset] > confThreshold)
+      scores.push_back(std::make_pair(confScoreData[confOffset], i));
+  }
+  std::stable_sort(scores.begin(), scores.end(), sortScorePairDescend<size_t>);
+  if (topK > 0 && topK < scores.size()) scores.resize(topK);
+
+  // Walk the candidates in score order instead of repeatedly erasing the
+  // front element, which made this loop quadratic.
+  for (const auto& candidate : scores) {
+    const size_t idx = candidate.second;
+    bool keep = true;
+    for (size_t i = 0; i < indices->size(); ++i) {
+      const real overlap = jaccardOverlap(bboxes[idx], bboxes[(*indices)[i]]);
+      if (!(overlap <= nmsThreshold)) {
+        keep = false;
+        break;
+      }
+    }
+    if (keep) indices->push_back(idx);
+  }
+}
+
+/**
+ * Runs per-class NMS for every image and, when keepTopK > 0, keeps only the
+ * keepTopK highest-scoring survivors per image.
+ *
+ * @param confThreshold       minimum score; taken as `real` so a fractional
+ *                            threshold is no longer truncated to an integer.
+ * @param allDetectionIndices output; one label -> prior-index map per image
+ *                            is appended.
+ * @return total number of kept boxes over the whole batch.
+ */
+size_t ProposalLayer::getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const real confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const real minWidth,
+    const real minHeight,
+    const vector<vector<UnnormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices) {
+  size_t totalKeepNum = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    const vector<UnnormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
+    const real* imageConf = confData + n * numPriorBBoxes * numClasses;
+    size_t numDetected = 0;
+    map<size_t, vector<size_t>> indices;
+    for (size_t c = 0; c < numClasses; ++c) {
+      if (c == backgroundId) continue;
+      applyNMSFast(decodedBBoxes,
+                   imageConf,
+                   c,
+                   nmsTopK,
+                   confThreshold,
+                   nmsThreshold,
+                   minWidth,
+                   minHeight,
+                   numPriorBBoxes,
+                   numClasses,
+                   &(indices[c]));
+      numDetected += indices[c].size();
+    }
+
+    if (keepTopK > 0 && numDetected > keepTopK) {
+      // Pool the survivors of all classes and keep the best keepTopK.
+      vector<pair<real, pair<size_t, size_t>>> scoreIndexPairs;
+      scoreIndexPairs.reserve(numDetected);
+      for (const auto& labelAndIndices : indices) {
+        const size_t c = labelAndIndices.first;
+        for (size_t idx : labelAndIndices.second) {
+          scoreIndexPairs.push_back(std::make_pair(
+              imageConf[idx * numClasses + c], std::make_pair(c, idx)));
+        }
+      }
+      std::sort(scoreIndexPairs.begin(),
+                scoreIndexPairs.end(),
+                sortScorePairDescend<pair<size_t, size_t>>);
+      scoreIndexPairs.resize(keepTopK);
+      map<size_t, vector<size_t>> newIndices;
+      for (const auto& scored : scoreIndexPairs) {
+        newIndices[scored.second.first].push_back(scored.second.second);
+      }
+      allDetectionIndices->push_back(newIndices);
+      totalKeepNum += keepTopK;
+    } else {
+      allDetectionIndices->push_back(indices);
+      totalKeepNum += numDetected;
+    }
+  }
+  return totalKeepNum;
+}
+
+/**
+ * Writes one row per kept box into `out`:
+ * image index, label, score, xmin, ymin, xmax, ymax.
+ */
+void ProposalLayer::getDetectionOutput(
+    const real* confData,
+    const size_t numKept,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t batchSize,
+    const vector<map<size_t, vector<size_t>>>& allIndices,
+    const vector<vector<UnnormalizedBBox>>& allDecodedBBoxes,
+    Matrix& out) {
+  MatrixPtr outBuffer;
+  Matrix::resizeOrCreate(outBuffer, numKept, kOutputRowSize, false, false);
+  real* bufferData = outBuffer->getData();
+  size_t count = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    const vector<UnnormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
+    for (const auto& labelAndIndices : allIndices[n]) {
+      const size_t label = labelAndIndices.first;
+      for (size_t idx : labelAndIndices.second) {
+        const size_t confOffset =
+            n * numPriorBBoxes * numClasses + idx * numClasses;
+        real* row = bufferData + count * kOutputRowSize;
+        row[0] = n;
+        row[1] = label;
+        row[2] = (confData + confOffset)[label];
+        row[3] = decodedBBoxes[idx].xMin;
+        row[4] = decodedBBoxes[idx].yMin;
+        row[5] = decodedBBoxes[idx].xMax;
+        row[6] = decodedBBoxes[idx].yMax;
+        ++count;
+      }
+    }
+  }
+  out.copyFrom(bufferData, numKept * kOutputRowSize);
+}
+
+/**
+ * Applies the predicted offsets (dx, dy, dw, dh) to an anchor and clips the
+ * result to the image.
+ *
+ * @param anchorBoxData xmin, ymin, xmax, ymax, flag, image width, image
+ *                      height of the anchor.
+ * @param locPredData   dx, dy, dw, dh.
+ * @param predBox       output; the decoded, clipped box.
+ */
+void ProposalLayer::decodeTarget(const std::vector<real>& anchorBoxData,
+                                 const std::vector<real>& locPredData,
+                                 UnnormalizedBBox& predBox) {
+  const real anchorBoxWidth = anchorBoxData[2] - anchorBoxData[0] + 1;
+  const real anchorBoxHeight = anchorBoxData[3] - anchorBoxData[1] + 1;
+  const real anchorBoxCenterX = (anchorBoxData[2] + anchorBoxData[0]) / 2;
+  const real anchorBoxCenterY = (anchorBoxData[3] + anchorBoxData[1]) / 2;
+
+  const real dx = locPredData[0];
+  const real dy = locPredData[1];
+  const real dw = locPredData[2];
+  const real dh = locPredData[3];
+
+  const real predCtrX = dx * anchorBoxWidth + anchorBoxCenterX;
+  const real predCtrY = dy * anchorBoxHeight + anchorBoxCenterY;
+  // The size targets are log(gtSize / anchorSize), so decoding is
+  // exp(d) * anchorSize - not exp(d * anchorSize) as written before.
+  const real predWidth = std::exp(dw) * anchorBoxWidth;
+  const real predHeight = std::exp(dh) * anchorBoxHeight;
+
+  // clip predicted box to image
+  const real xMin = static_cast<real>(0.);
+  const real yMin = static_cast<real>(0.);
+  const real xMax = anchorBoxData[5] - 1;
+  const real yMax = anchorBoxData[6] - 1;
+  predBox.xMin =
+      clipTo(static_cast<real>(predCtrX - 0.5 * predWidth), xMin, xMax);
+  predBox.yMin =
+      clipTo(static_cast<real>(predCtrY - 0.5 * predHeight), yMin, yMax);
+  predBox.xMax =
+      clipTo(static_cast<real>(predCtrX + 0.5 * predWidth), xMin, xMax);
+  predBox.yMax =
+      clipTo(static_cast<real>(predCtrY + 0.5 * predHeight), yMin, yMax);
+}
+
+/**
+ * Permutes the loc/conf inputs to NHWC, applies softmax to the class scores,
+ * decodes every anchor of every image, runs NMS and writes the kept boxes
+ * (7 values per row) to the output. All post-processing happens on the CPU;
+ * GPU inputs are copied over first.
+ */
+void ProposalLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+
+  locSizeSum_ = 0;
+  confSizeSum_ = 0;
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    locSizeSum_ += inLoc->getElementCnt();
+    confSizeSum_ += inConf->getElementCnt();
+  }
+
+  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
+
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto& layerConf = config_.inputs(0).proposal_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+
+    // Fall back to the configured size when the input carries no frame size.
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+    locOffset += appendWithPermute(*inLoc,
+                                   height,
+                                   width,
+                                   locSizeSum_,
+                                   locOffset,
+                                   batchSize,
+                                   *locTmpBuffer_,
+                                   kNCHWToNHWC);
+    confOffset += appendWithPermute(*inConf,
+                                    height,
+                                    width,
+                                    confSizeSum_,
+                                    confOffset,
+                                    batchSize,
+                                    *confTmpBuffer_,
+                                    kNCHWToNHWC);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+
+  MatrixPtr priorValue;
+  if (useGpu_) {
+    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
+    Matrix::resizeOrCreate(
+        confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false);
+    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
+    Matrix::resizeOrCreate(
+        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
+
+    locCpuBuffer_->copyFrom(*locTmpBuffer_);
+    confCpuBuffer_->copyFrom(*confTmpBuffer_);
+    priorCpuValue_->copyFrom(*priorTmpValue);
+
+    locBuffer_ = locCpuBuffer_;
+    confBuffer_ = confCpuBuffer_;
+    priorValue = priorCpuValue_;
+  } else {
+    priorValue = getInputValue(*getPriorBoxLayer());
+    locBuffer_ = locTmpBuffer_;
+    confBuffer_ = confTmpBuffer_;
+  }
+  confBuffer_->softmax(*confBuffer_);
+
+  const size_t numPriors = priorValue->getElementCnt() / kPriorEntrySize;
+  const real* priorData = priorValue->getData();
+  const real* locData = locBuffer_->getData();
+  std::vector<std::vector<UnnormalizedBBox>> allDecodedBBoxes;
+  allDecodedBBoxes.reserve(batchSize);
+  for (size_t n = 0; n < batchSize; ++n) {
+    std::vector<UnnormalizedBBox> decodedBBoxes;
+    decodedBBoxes.reserve(numPriors);
+    for (size_t i = 0; i < numPriors; ++i) {
+      const real* anchor = priorData + i * kPriorEntrySize;
+      const std::vector<real> anchorBoxData(anchor, anchor + kPriorEntrySize);
+      const real* loc = locData + (n * numPriors + i) * kLocEntrySize;
+      const std::vector<real> locPredData(loc, loc + kLocEntrySize);
+      UnnormalizedBBox bbox;
+      decodeTarget(anchorBoxData, locPredData, bbox);
+      decodedBBoxes.push_back(bbox);
+    }
+    allDecodedBBoxes.push_back(decodedBBoxes);
+  }
+
+  std::vector<std::map<size_t, std::vector<size_t>>> allIndices;
+  size_t numKept = getDetectionIndices(confBuffer_->getData(),
+                                       numPriors,
+                                       numClasses_,
+                                       backgroundId_,
+                                       batchSize,
+                                       confidenceThreshold_,
+                                       nmsTopK_,
+                                       nmsThreshold_,
+                                       keepTopK_,
+                                       minWidth_,
+                                       minHeight_,
+                                       allDecodedBBoxes,
+                                       &allIndices);
+
+  resetOutput(numKept, kOutputRowSize);
+  MatrixPtr outV = getOutputValue();
+  getDetectionOutput(confBuffer_->getData(),
+                     numKept,
+                     numPriors,
+                     numClasses_,
+                     batchSize,
+                     allIndices,
+                     allDecodedBBoxes,
+                     *outV);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ProposalLayer.h b/paddle/gserver/layers/ProposalLayer.h
new file mode 100644
index 0000000000000..d41e888fcc56c
--- /dev/null
+++ b/paddle/gserver/layers/ProposalLayer.h
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <map>
+#include <utility>
+#include <vector>
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+using std::vector;
+using std::pair;
+using std::map;
+
+namespace paddle {
+
+/**
+ * The detection output layer to generate proposals in RPN of Faster R-CNN.
+ * This layer applies non-maximum suppression to all predicted bounding
+ * boxes and keeps the Top-K bounding boxes.
+ * - Input: This layer needs three input layers: The first input layer
+ *          is the anchor layer. The other two input layers are convolution
+ *          layers generating the bbox location offsets and the
+ *          classification confidence, in that order.
+ * - Output: The predicted bounding box locations, one row per kept box:
+ *           image index, label, score, xmin, ymin, xmax, ymax.
+ * Reference:
+ *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ *    Networks
+ */
+
+class ProposalLayer : public Layer {
+public:
+  explicit ProposalLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  // Proposal generation is not differentiated; nothing to back-propagate.
+  void backward(const UpdateCallback& callback = nullptr) override {}
+
+protected:
+  // Input 0: anchors produced by the anchor layer.
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+
+  // Inputs 1 .. inputNum_: location-offset predictions.
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[1 + index];
+  }
+
+  // Inputs 1 + inputNum_ .. : class-confidence predictions.
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[1 + inputNum_ + index];
+  }
+
+  // Box in pixel coordinates; width/height are pixel-inclusive (+1).
+  struct UnnormalizedBBox : BBoxBase<real> {
+    UnnormalizedBBox() : BBoxBase<real>() {}
+    real getWidth() const { return xMax - xMin + 1; }
+    real getHeight() const { return yMax - yMin + 1; }
+  };
+
+  // Intersection-over-union of two pixel-coordinate boxes.
+  real jaccardOverlap(const UnnormalizedBBox& bbox1,
+                      const UnnormalizedBBox& bbox2);
+
+  // Greedy NMS for one class of one image; appends kept indices.
+  void applyNMSFast(const vector<UnnormalizedBBox>& bboxes,
+                    const real* confScoreData,
+                    size_t classIdx,
+                    size_t topK,
+                    real confThreshold,
+                    real nmsThreshold,
+                    real minWidth,
+                    real minHeight,
+                    size_t numPriorBBoxes,
+                    size_t numClasses,
+                    vector<size_t>* indices);
+
+  // Per-image NMS plus keepTopK filtering; returns the total kept count.
+  size_t getDetectionIndices(
+      const real* confData,
+      const size_t numPriorBBoxes,
+      const size_t numClasses,
+      const size_t backgroundId,
+      const size_t batchSize,
+      const real confThreshold,
+      const size_t nmsTopK,
+      const real nmsThreshold,
+      const size_t keepTopK,
+      const real minWidth,
+      const real minHeight,
+      const vector<vector<UnnormalizedBBox>>& allDecodedBBoxes,
+      vector<map<size_t, vector<size_t>>>* allDetectionIndices);
+
+  // Writes the kept boxes, 7 values per row, into `out`.
+  void getDetectionOutput(
+      const real* confData,
+      const size_t numKept,
+      const size_t numPriorBBoxes,
+      const size_t numClasses,
+      const size_t batchSize,
+      const vector<map<size_t, vector<size_t>>>& allIndices,
+      const vector<vector<UnnormalizedBBox>>& allDecodedBBoxes,
+      Matrix& out);
+
+  // Applies (dx, dy, dw, dh) to an anchor and clips the box to the image.
+  void decodeTarget(const std::vector<real>& anchorBoxData,
+                    const std::vector<real>& locPredData,
+                    UnnormalizedBBox& predBox);
+
+private:
+  // Configuration; set by init(), the defaults only avoid indeterminate
+  // values before that.
+  real nmsThreshold_{0};
+  real confidenceThreshold_{0};
+  size_t nmsTopK_{0};
+  size_t keepTopK_{0};
+  real minWidth_{0};
+  real minHeight_{0};
+
+  size_t numClasses_{0};
+  size_t inputNum_{0};
+  size_t backgroundId_{0};
+
+  // Total element counts of the loc / conf inputs, recomputed every forward.
+  size_t locSizeSum_{0};
+  size_t confSizeSum_{0};
+
+  // locBuffer_/confBuffer_ alias either the Tmp (CPU run) or Cpu (GPU run)
+  // buffers below.
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RPNLossLayer.cpp b/paddle/gserver/layers/RPNLossLayer.cpp
new file mode 100644
index 0000000000000..7ea4d80e84472
--- /dev/null
+++ b/paddle/gserver/layers/RPNLossLayer.cpp
@@ -0,0 +1,589 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "RPNLossLayer.h" +#include +#include +#include "DataLayer.h" +#include "paddle/utils/ThreadLocal.h" + +namespace paddle { + +REGISTER_LAYER(rpn_loss, RPNLossLayer); + +bool RPNLossLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + auto layerConf = config_.inputs(0).rpn_loss_conf(); + posOverlapThreshold_ = layerConf.pos_overlap_threshold(); + negOverlapThreshold_ = layerConf.neg_overlap_threshold(); + rpnBatchSize_ = layerConf.rpn_batch_size(); + rpnFgRatio_ = layerConf.rpn_fg_ratio(); + lossRatio_ = layerConf.loss_ratio(); + numClasses_ = 2; + inputNum_ = 1; + backgroundId_ = 0; + + return true; +} + +void RPNLossLayer::bboxOverlaps( + const std::vector>& anchorBoxes, + const std::vector>& gtBBoxes, + std::vector& overlaps) { + for (size_t i = 0; i < anchorBoxes.size(); ++i) { + if (anchorBoxes[i][4] == -1) { + continue; // out of the image, keep only inside anchors + } + for (size_t j = 0; j < gtBBoxes.size(); ++j) { + real width = std::min(anchorBoxes[i][2], gtBBoxes[j][2]) - + std::max(anchorBoxes[i][0], gtBBoxes[j][0]) + 1; + real height = std::min(anchorBoxes[i][3], gtBBoxes[j][3]) - + std::max(anchorBoxes[i][1], gtBBoxes[j][1]) + 1; + if (width > 0 && height > 0) { + real gtboxArea = (gtBBoxes[j][2] - gtBBoxes[j][0] + 1) * + (gtBBoxes[j][3] - gtBBoxes[j][1] + 1); + real anchorArea = (anchorBoxes[i][2] - anchorBoxes[i][0] + 1) * + (anchorBoxes[i][3] - anchorBoxes[i][1] + 1); + real overlapArea = width * height; + overlaps[i * gtBBoxes.size() + j] = + overlapArea / (gtboxArea + anchorArea - overlapArea); + } + } + } +} + +std::pair RPNLossLayer::labelAnchors( + const std::vector>& anchorBoxes, + const std::vector>& gtBBoxes, + const std::vector& overlaps, + const real posOverlapThreshold, + const real negOverlapThreshold, + std::vector& matchIndices, + std::vector& labels) { + size_t numPos = 0; + size_t numNeg = 0; + std::vector gtBBoxMaxIdxs( + gtBBoxes.size(), -1); // anchor 
index with max overlap of each gtBBox + for (size_t n = 0; n < overlaps.size(); ++n) { + size_t anchorIdx = n / gtBBoxes.size(); + size_t gtBBoxIdx = n % gtBBoxes.size(); + if (matchIndices[anchorIdx] == -1 || + overlaps[n] > + overlaps[anchorIdx * gtBBoxes.size() + matchIndices[anchorIdx]]) { + matchIndices[anchorIdx] = gtBBoxIdx; // overlaps.argmax(axis=1) + } + if (gtBBoxMaxIdxs[gtBBoxIdx] == -1 || + overlaps[n] > + overlaps[gtBBoxMaxIdxs[gtBBoxIdx] * gtBBoxes.size() + gtBBoxIdx]) { + gtBBoxMaxIdxs[gtBBoxIdx] = anchorIdx; // overlaps.argmax(axis=0) + } + } + for (size_t n = 0; n < gtBBoxMaxIdxs.size(); + ++n) { // fg label: anchor with highest overlap for each gtBBox + if (overlaps[gtBBoxMaxIdxs[n] * gtBBoxes.size() + n] > 0) { + labels[gtBBoxMaxIdxs[n]] = 1; + } + } + for (size_t n = 0; n < anchorBoxes.size(); + ++n) { // fg/bg/disabled label: above/below threshold IOU + if (overlaps[n * gtBBoxes.size() + matchIndices[n]] >= + posOverlapThreshold) { + labels[n] = 1; + } else if (overlaps[n * gtBBoxes.size() + matchIndices[n]] <= + negOverlapThreshold) { + if (overlaps[n * gtBBoxes.size() + matchIndices[n]] < 0) { + labels[n] = -1; // out of the image + } else { + labels[n] = 0; + } + } + } + for (size_t n = 0; n < labels.size(); ++n) { + if (labels[n] == 1) { + ++numPos; + } else if (labels[n] == 0) { + ++numNeg; + } + } + return std::make_pair(numPos, numNeg); +} + +template +void RPNLossLayer::sampleAnchors( + std::vector& allLabels, T label, T disabledLable, size_t m, size_t n) { + auto& randEngine = ThreadLocalRandomEngine::get(); + for (size_t i = 0; i < allLabels.size(); ++i) { + if (allLabels[i] == label) { + if (rand_(randEngine) * n < m) { + --m; + } else { + allLabels[i] = disabledLable; + } + --n; + } + } +} + +pair RPNLossLayer::generateMatchIndices( + const Matrix& priorValue, + const size_t numPriorBBoxes, + const Matrix& gtValue, + const int* gtStartPosPtr, + const size_t seqNum, + const size_t batchSize, + const real posOverlapThreshold, + const 
real negOverlapThreshold, + const size_t boxBatchSize, + const real boxFgRatio, + std::vector>* matchIndicesVecPtr, + std::vector>* negIndicesVecPtr) { + size_t totalPos = 0; + size_t totalNeg = 0; + std::vector allLabels; + std::vector allTargets; + + std::vector> anchorBoxes; + const real* priorData = priorValue.getData(); + for (size_t n = 0; n < numPriorBBoxes; ++n) { + std::vector anchorBox; + anchorBox.push_back(*(priorData + n * 7 + 0)); + anchorBox.push_back(*(priorData + n * 7 + 1)); + anchorBox.push_back(*(priorData + n * 7 + 2)); + anchorBox.push_back(*(priorData + n * 7 + 3)); + anchorBox.push_back(*(priorData + n * 7 + 4)); + anchorBoxes.push_back(anchorBox); + } + + for (size_t n = 0; n < batchSize; ++n) { + std::vector matchIndices; + std::vector negIndices; + matchIndices.resize(numPriorBBoxes, -1); + size_t numGTBBoxes = 0; + if (n < seqNum) numGTBBoxes = gtStartPosPtr[n + 1] - gtStartPosPtr[n]; + if (!numGTBBoxes) { + matchIndicesVecPtr->push_back(matchIndices); + negIndicesVecPtr->push_back(negIndices); + continue; + } + std::vector> gtBBoxes; + if (n < seqNum) numGTBBoxes = gtStartPosPtr[n + 1] - gtStartPosPtr[n]; + auto startPos = gtValue.getData() + gtStartPosPtr[n] * 4; + for (size_t i = 0; i < numGTBBoxes; ++i) { + std::vector gtBBox; + gtBBox.push_back(*(startPos + i * 4 + 0)); + gtBBox.push_back(*(startPos + i * 4 + 1)); + gtBBox.push_back(*(startPos + i * 4 + 2)); + gtBBox.push_back(*(startPos + i * 4 + 3)); + gtBBoxes.push_back(gtBBox); + } + + std::vector overlaps(anchorBoxes.size() * gtBBoxes.size(), + -1); // init with -1 to label disabled anchors + bboxOverlaps(anchorBoxes, + gtBBoxes, + overlaps); // calculate the overlaps of anchors and gtBBoxes + + std::vector labels(anchorBoxes.size(), + -1); // init with -1 to label disabled anchors + std::pair numLabels = + labelAnchors(anchorBoxes, + gtBBoxes, + overlaps, + posOverlapThreshold, + negOverlapThreshold, + matchIndices, + labels); // lable the anchors + totalPos += 
numLabels.first; + totalNeg += numLabels.second; + matchIndicesVecPtr->push_back(matchIndices); + std::copy(labels.begin(), labels.end(), std::back_inserter(allLabels)); + } + + size_t numPos = boxBatchSize * boxFgRatio; + if (totalPos > numPos) { // subsample positive labels if we have too many + sampleAnchors(allLabels, 1, -1, numPos, totalPos); + } + size_t numNeg = boxBatchSize - numPos; + if (totalNeg > numNeg) { // subsample negative labels if we have too many + sampleAnchors(allLabels, 0, -1, numNeg, totalNeg); + } + + for (size_t n = 0; n < batchSize; ++n) { + std::vector negIndices; + for (size_t i = 0; i < numPriorBBoxes; ++i) { + size_t idx = n * numPriorBBoxes + i; + if (allLabels[idx] != 1) { + (*matchIndicesVecPtr)[n][i] = -1; + if (allLabels[idx] == 0) { + negIndices.push_back(i); + } + } + } + negIndicesVecPtr->push_back(negIndices); + } + + return std::make_pair(numPos, numNeg); +} + +void RPNLossLayer::encodeTarget(const std::vector& anchorBox, + const std::vector& gtBBox, + std::vector& target) { + real anchorBoxWidth = anchorBox[2] - anchorBox[0] + 1; + real anchorBoxHeight = anchorBox[3] - anchorBox[1] + 1; + real anchorBoxCenterX = (anchorBox[2] + anchorBox[0]) / 2; + real anchorBoxCenterY = (anchorBox[3] + anchorBox[1]) / 2; + + real gtBBoxWidth = gtBBox[2] - gtBBox[0] + 1; + real gtBBoxHeight = gtBBox[3] - gtBBox[1] + 1; + real gtBBoxCenterX = (gtBBox[2] + gtBBox[0]) / 2; + real gtBBoxCenterY = (gtBBox[3] + gtBBox[1]) / 2; + + target[0] = (gtBBoxCenterX - anchorBoxCenterX) / anchorBoxWidth; + target[1] = (gtBBoxCenterY - anchorBoxCenterY) / anchorBoxHeight; + target[2] = std::log(gtBBoxWidth / anchorBoxWidth); + target[3] = std::log(gtBBoxHeight / anchorBoxHeight); +} + +void RPNLossLayer::forward(PassType passType) { + Layer::forward(passType); + size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight(); + resetOutput(batchSize, 1); + + // all location data and confidence score data + locSizeSum_ = 0; + confSizeSum_ = 0; + for 
(size_t n = 0; n < inputNum_; ++n) { // there is only one for RPN + const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n)); + const MatrixPtr inConf = getInputValue(*getConfInputLayer(n)); + locSizeSum_ += inLoc->getElementCnt(); + confSizeSum_ += inConf->getElementCnt(); + } + + // locBuffer layout: + // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ...... + Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_); + locBuffer_ = locTmpBuffer_; + + // confBuffer layout: + // | class1 score | class2 score | ... |classN score | class1 score | ...... + Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_); + confBuffer_ = confTmpBuffer_; + + // concate location data and confidence score data + size_t locOffset = 0; + size_t confOffset = 0; + auto& layerConf = config_.inputs(0).rpn_loss_conf(); + for (size_t n = 0; n < inputNum_; ++n) { // there is only one for RPN + const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n)); + const MatrixPtr inConf = getInputValue(*getConfInputLayer(n)); + size_t height = getInput(*getLocInputLayer(n)).getFrameHeight(); + if (!height) height = layerConf.height(); + size_t width = getInput(*getLocInputLayer(n)).getFrameWidth(); + if (!width) width = layerConf.width(); + locOffset += appendWithPermute(*inLoc, + height, + width, + locSizeSum_, + locOffset, + batchSize, + *locBuffer_, + kNCHWToNHWC); + confOffset += appendWithPermute(*inConf, + height, + width, + confSizeSum_, + confOffset, + batchSize, + *confBuffer_, + kNCHWToNHWC); + } + CHECK_EQ(locOffset, locSizeSum_ / batchSize); + CHECK_EQ(confOffset, confSizeSum_ / batchSize); + + // priorValue layout: + // | xmin1 | ymin1 | xmax1 | ymax1 | overflow_flag | img_width | img_height | + // | xmin2 | ...... + MatrixPtr priorValue; + + // labelValue layout: + // | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | ...... 
+ MatrixPtr labelValue; + + // Copy data from GPU to CPU if use GPU + if (useGpu_) { + Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false); + Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false); + MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer()); + Matrix::resizeOrCreate( + priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false); + MatrixPtr labelTmpValue = getInputValue(*getLabelLayer()); + Matrix::resizeOrCreate(labelCpuValue_, + labelTmpValue->getHeight(), + labelTmpValue->getWidth(), + false, + false); + + locCpuBuffer_->copyFrom(*locTmpBuffer_); + confCpuBuffer_->copyFrom(*confTmpBuffer_); + priorCpuValue_->copyFrom(*priorTmpValue); + labelCpuValue_->copyFrom(*labelTmpValue); + + locBuffer_ = locCpuBuffer_; + confBuffer_ = confCpuBuffer_; + priorValue = priorCpuValue_; + labelValue = labelCpuValue_; + } else { + priorValue = getInputValue(*getPriorBoxLayer()); + labelValue = getInputValue(*getLabelLayer()); + } + + // Match anchor-box to groundtruth bbox + Argument label = getInput(*getLabelLayer()); + const int* labelIndex = label.sequenceStartPositions->getData(false); + size_t seqNum = label.getNumSequences(); + numMatches_ = 0; + numNegs_ = 0; + allMatchIndices_.clear(); + allNegIndices_.clear(); + numPriors_ = priorValue->getElementCnt() / 7; + + std::pair retPair = generateMatchIndices(*priorValue, + numPriors_, + *labelValue, + labelIndex, + seqNum, + batchSize, + posOverlapThreshold_, + negOverlapThreshold_, + rpnBatchSize_, + rpnFgRatio_, + &allMatchIndices_, + &allNegIndices_); + numMatches_ = retPair.first; + numNegs_ = retPair.second; + + // BBox location L1 smooth loss + locLoss_ = 0.0; + if (numMatches_ >= 1) { + size_t count = 0; + MatrixPtr locLossOutput; + Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false); + Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false); + Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false); + locDiff_->zeroMem(); 
+ std::vector locGTData; + + real* locDiffData = locDiff_->getData(); + const real* locBufferData = locBuffer_->getData(); + for (size_t n = 0; n < batchSize; ++n) { + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; // match none + size_t locOffset = + n * (locBuffer_->getElementCnt() / batchSize) + i * 4; + std::copy(locBufferData + locOffset, + locBufferData + locOffset + 4, + locDiffData + count); + count += 4; + const int gtIdx = allMatchIndices_[n][i]; + auto* priorOffset = priorValue->getData() + i * 7; + std::vector anchorBox{ + *(priorOffset + 0), + *(priorOffset + 1), + *(priorOffset + 2), + *(priorOffset + 3), + }; + auto* labelOffset = labelValue->getData() + (labelIndex[n] + gtIdx) * 4; + std::vector gtBBox{ + *(labelOffset + 0), + *(labelOffset + 1), + *(labelOffset + 2), + *(labelOffset + 3), + }; + std::vector gtEncode(4); + encodeTarget(anchorBox, gtBBox, gtEncode); + locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end()); + } + } + locGTData_->copyFrom(&locGTData[0], numMatches_ * 4); + locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0); + locLoss_ = locLossOutput->getSum() / numMatches_ * lossRatio_; + } + + // BBox confidence softmax loss + confLoss_ = 0; + numConf_ = numMatches_ + numNegs_; + if (numConf_ >= 1) { + Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false); + IVector::resizeOrCreate(confGTData_, numConf_, false); + confProb_->zeroMem(); + size_t count = 0; + + std::vector confPredData; + real* confProbData = confProb_->getData(); + const real* confBufferData = confBuffer_->getData(); + for (size_t n = 0; n < batchSize; ++n) { + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; + confGTData_->getData()[count] = 1; + size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_; + std::copy(confBufferData + confOffset, + confBufferData + confOffset + numClasses_, + confProbData + count * numClasses_); + 
confPredData.reserve(confPredData.size() + numClasses_); + confPredData.insert(confPredData.end(), + confBufferData + confOffset, + confBufferData + confOffset + numClasses_); + ++count; + } + for (size_t i = 0; i < allNegIndices_[n].size(); ++i) { + confGTData_->getData()[count] = backgroundId_; + size_t confOffset = + n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_; + std::copy(confBufferData + confOffset, + confBufferData + confOffset + numClasses_, + confProbData + count * numClasses_); + confPredData.reserve(confPredData.size() + numClasses_); + confPredData.insert(confPredData.end(), + confBufferData + confOffset, + confBufferData + confOffset + numClasses_); + ++count; + } + } + CHECK_EQ(numConf_, count); + confProb_->softmax(*confProb_); + MatrixPtr confLossOutput; + Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false); + confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_); + confLoss_ = confLossOutput->getSum() / numConf_; + } + real loss = locLoss_ + confLoss_; + MatrixPtr outV = getOutputValue(); + outV->assign(loss); +} + +void RPNLossLayer::backward(const UpdateCallback& callback) { + size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight(); + locBuffer_->zeroMem(); + confBuffer_->zeroMem(); + + // Back propagate on location prediction + if (numMatches_ >= 1) { + MatrixPtr locDiffBuffer; + Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false); + locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0); + locDiff_->copyFrom(*locDiffBuffer); + // scale gradient + for (size_t i = 0; i < numMatches_ * 4; ++i) + locDiff_->getData()[i] *= (1. 
/ numMatches_ * lossRatio_); + // Copy gradient back + size_t count = 0; + const real* locDiffData = locDiff_->getData(); + for (size_t n = 0; n < batchSize; ++n) { + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; + real* locBufferData = + locBuffer_->getData() + n * numPriors_ * 4 + i * 4; + std::copy(locDiffData + count * 4, + locDiffData + (count + 1) * 4, + locBufferData); + ++count; + } + } + CHECK_EQ(count, numMatches_); + } + + if (numConf_ >= 1) { + for (size_t i = 0; i < numConf_; ++i) + confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1; + for (size_t i = 0; i < numConf_ * numClasses_; ++i) + confProb_->getData()[i] *= (1. / numConf_); + size_t count = 0; + const real* confProbData = confProb_->getData(); + for (size_t n = 0; n < batchSize; ++n) { + for (size_t i = 0; i < numPriors_; ++i) { + if (allMatchIndices_[n][i] == -1) continue; + real* confDiffData = confBuffer_->getData() + + n * numPriors_ * numClasses_ + i * numClasses_; + std::copy(confProbData + count * numClasses_, + confProbData + (count + 1) * numClasses_, + confDiffData); + ++count; + } + for (size_t i = 0; i < allNegIndices_[n].size(); ++i) { + int idx = allNegIndices_[n][i]; + real* confDiffData = confBuffer_->getData() + + n * numPriors_ * numClasses_ + idx * numClasses_; + std::copy(confProbData + count * numClasses_, + confProbData + (count + 1) * numClasses_, + confDiffData); + ++count; + } + } + CHECK_EQ(count, numConf_); + } + if (useGpu_) { + locTmpBuffer_->copyFrom(*locCpuBuffer_); + confTmpBuffer_->copyFrom(*confCpuBuffer_); + locBuffer_ = locTmpBuffer_; + confBuffer_ = confTmpBuffer_; + } + // copy back + size_t locOffset = 0; + size_t confOffset = 0; + auto layerConf = config_.inputs(0).rpn_loss_conf(); + for (size_t n = 0; n < inputNum_; ++n) { + const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n)); + const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n)); + size_t height = 
getInput(*getLocInputLayer(n)).getFrameHeight(); + // only for unittest, there are no width and height information + // when constructing matrix in unittest, so we should + // set the shape in configuration + if (!height) height = layerConf.height(); + size_t width = getInput(*getLocInputLayer(n)).getFrameWidth(); + if (!width) width = layerConf.width(); + + // NHWC to NCHW + MatrixPtr locGBuffer; + Matrix::resizeOrCreate( + locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_); + MatrixPtr confGBuffer; + Matrix::resizeOrCreate( + confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_); + + locOffset += decomposeWithPermute(*locBuffer_, + height, + width, + locSizeSum_, + locOffset, + batchSize, + *locGBuffer, + kNHWCToNCHW); + inLocG->add(*locGBuffer); + confOffset += decomposeWithPermute(*confBuffer_, + height, + width, + confSizeSum_, + confOffset, + batchSize, + *confGBuffer, + kNHWCToNCHW); + inConfG->add(*confGBuffer); + } + CHECK_EQ(locOffset, locSizeSum_ / batchSize); + CHECK_EQ(confOffset, confSizeSum_ / batchSize); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/RPNLossLayer.h b/paddle/gserver/layers/RPNLossLayer.h new file mode 100644 index 0000000000000..60755f3685ed3 --- /dev/null +++ b/paddle/gserver/layers/RPNLossLayer.h @@ -0,0 +1,142 @@ +/* copyright (c) 2016 paddlepaddle authors. all rights reserve. + +licensed under the apache license, version 2.0 (the "license"); +you may not use this file except in compliance with the license. +you may obtain a copy of the license at + + http://www.apache.org/licenses/license-2.0 + +unless required by applicable law or agreed to in writing, software +distributed under the license is distributed on an "as is" basis, +without warranties or conditions of any kind, either express or implied. +see the license for the specific language governing permissions and +limitations under the license. 
*/ + +#pragma once + +#include +#include "CostLayer.h" +#include "DataLayer.h" +#include "DetectionUtil.h" +#include "Layer.h" + +using std::vector; +using std::pair; + +namespace paddle { + +/** + * The loss layer for region proposal in Faster R-CNN. + * The loss is composed by the location loss and the confidence loss. + * The location loss is a smooth L1 loss and the confidence loss is + * a softmax loss. + * - Input: This layer needs four input layers: The first input layer + * is the anchor-box layer and the second layer is a label layer. + * The rest two input layers are convolution layers for generating + * bbox location offset and the classification confidence. + * - Output: The Region Proposal Networks loss value. + * Reference: + * Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. + * Faster R-CNN: Towards Real-Time Object Detection with Region Proposal + */ + +class RPNLossLayer : public CostLayer { +public: + explicit RPNLossLayer(const LayerConfig& config) + : CostLayer(config), rand_(0, 1) {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + + void backward(const UpdateCallback& callback = nullptr); + + void forwardImp(Matrix& output, Argument& label, Matrix& cost) {} + + void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {} + +protected: + inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; } + inline LayerPtr getLabelLayer() { return inputLayers_[1]; } + inline LayerPtr getLocInputLayer(size_t index) { + return inputLayers_[2 + index]; + } + inline LayerPtr getConfInputLayer(size_t index) { + return inputLayers_[2 + inputNum_ + index]; + } + + void bboxOverlaps(const std::vector>& anchorBoxes, + const std::vector>& gtBBoxes, + std::vector& overlaps); + + std::pair labelAnchors( + const std::vector>& anchorBoxes, + const std::vector>& gtBBoxes, + const std::vector& overlaps, + const real posOverlapThreshold, + const real negOverlapThreshold, + 
std::vector& matchIndices, + std::vector& labels); + + template + void sampleAnchors( + std::vector& allLabels, T label, T disabledLable, size_t m, size_t n); + + pair generateMatchIndices( + const Matrix& priorValue, + const size_t numPriorBBoxes, + const Matrix& gtValue, + const int* gtStartPosPtr, + const size_t seqNum, + const size_t batchSize, + const real posOverlapThreshold, + const real negOverlapThreshold, + const size_t boxBatchSize, + const real boxFgRatio, + std::vector>* matchIndicesVecPtr, + std::vector>* negIndicesVecPtr); + + void encodeTarget(const std::vector& anchorBox, + const std::vector& gtBBox, + std::vector& target); + +protected: + real posOverlapThreshold_; + real negOverlapThreshold_; + size_t rpnBatchSize_; + real rpnFgRatio_; + real lossRatio_; + size_t numClasses_; + size_t inputNum_; + size_t backgroundId_; + std::uniform_real_distribution rand_; + + real locLoss_; + real confLoss_; + + size_t numPriors_; + size_t numMatches_; + size_t numNegs_; + size_t numConf_; + size_t locSizeSum_; + size_t confSizeSum_; + + vector> allMatchIndices_; + vector> allNegIndices_; + MatrixPtr locGTData_; + IVectorPtr confGTData_; + + MatrixPtr locBuffer_; + MatrixPtr confBuffer_; + MatrixPtr locDiff_; + MatrixPtr confProb_; + + MatrixPtr labelCpuValue_; + MatrixPtr priorCpuValue_; + MatrixPtr locCpuBuffer_; + MatrixPtr confCpuBuffer_; + MatrixPtr locTmpBuffer_; + MatrixPtr confTmpBuffer_; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 92f6cbcfe5a0e..f802b3efed123 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -52,6 +52,13 @@ add_unittest_without_exec(test_DetectionOutput add_test(NAME test_DetectionOutput COMMAND test_DetectionOutput) +################# test_Proposal ####################### +add_unittest_without_exec(test_Proposal + test_Proposal.cpp + LayerGradUtil.cpp) + +add_test(NAME test_Proposal + COMMAND test_Proposal) 
################# test_ConvUnify ####################### add_unittest_without_exec(test_ConvUnify test_ConvUnify.cpp diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 67251f08e34fa..2c02ca666c4e3 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1766,6 +1766,75 @@ TEST(Layer, multibox_loss) { } } +TEST(Layer, rpn_loss) { + TestConfig config; + config.layerConfig.set_type("rpn_loss"); + config.biasSize = 0; + LayerInputConfig* input = config.layerConfig.add_inputs(); + RPNLossConfig* rpnLoss = input->mutable_rpn_loss_conf(); + rpnLoss->set_pos_overlap_threshold(0.7); + rpnLoss->set_neg_overlap_threshold(0.3); + rpnLoss->set_rpn_batch_size(2); + rpnLoss->set_rpn_fg_ratio(0.5); + rpnLoss->set_loss_ratio(10); + rpnLoss->set_height(3); + rpnLoss->set_width(3); + + size_t gtNum = 1; + MatrixPtr labelValue = Matrix::create(gtNum, 4, false, false); + real* labelData = labelValue->getData(); + size_t labelWidth = labelValue->getWidth(); + for (size_t i = 0; i < gtNum; ++i) { + *(labelData + i * labelWidth + 0) = 1; + *(labelData + i * labelWidth + 1) = 1; + *(labelData + i * labelWidth + 2) = 6; + *(labelData + i * labelWidth + 3) = 6; + } + vector seqStartPositions(gtNum + 1, 0); + for (size_t i = 1; i <= gtNum; ++i) { + seqStartPositions[i] = i; + } + + // Ensure at lease one matched bbox + MatrixPtr priorValue = Matrix::create(1, 63, false, false); + real* priorData = priorValue->getData(); + for (size_t i = 0; i < priorValue->getElementCnt() / 7; ++i) { + *priorData++ = std::rand() % 16; + *priorData++ = std::rand() % 16; + *priorData++ = std::rand() % 16 + std::rand() % 16; + *priorData++ = std::rand() % 16 + std::rand() % 16; + if (*(priorData - 2) < 16 && *(priorData - 1) < 16) { + *priorData++ = 1; + } else { + *priorData++ = -1; + } + *priorData++ = 16; + *priorData++ = 16; + } + priorData = priorValue->getData(); + *(priorData + 0) = 1; + *(priorData + 1) = 2; + 
*(priorData + 2) = 6; + *(priorData + 3) = 7; + *(priorData + 4) = 1; + *(priorData + 5) = 16; + *(priorData + 6) = 16; + + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "anchorbox", priorValue, {}}); + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions}); + config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0}); + config.inputDefs.push_back({INPUT_DATA, "confPred", 18, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "rpn_loss", 1, false, useGpu, false); + } +} + TEST(Layer, TransLayer) { TestConfig config; const int height = 128; diff --git a/paddle/gserver/tests/test_Proposal.cpp b/paddle/gserver/tests/test_Proposal.cpp new file mode 100644 index 0000000000000..28593363544d3 --- /dev/null +++ b/paddle/gserver/tests/test_Proposal.cpp @@ -0,0 +1,163 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +// Do one forward pass of proposal layer and check to see if its output +// matches the given result +void doOneProposalTest(MatrixPtr& inputLoc, + MatrixPtr& inputConf, + MatrixPtr& inputAnchor, + size_t feature_map_width, + size_t feature_map_height, + real nms_threshold, + bool use_gpu, + MatrixPtr& result) { + // Setting up the proposal layer + TestConfig configt; + configt.layerConfig.set_type("proposal"); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + configt.layerConfig.add_inputs(); + configt.layerConfig.add_inputs(); + + ProposalConfig* proposalConf = input->mutable_proposal_conf(); + proposalConf->set_width(feature_map_width); + proposalConf->set_height(feature_map_height); + proposalConf->set_nms_threshold(nms_threshold); + proposalConf->set_confidence_threshold(0.01); + proposalConf->set_nms_top_k(20); + proposalConf->set_keep_top_k(10); + proposalConf->set_min_width(0); + proposalConf->set_min_height(0); + configt.inputDefs.push_back({INPUT_DATA_TARGET, "anchors", 28, 0}); + configt.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0}); + configt.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0}); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "anchors", 1, false, use_gpu); + + dataLayers[0]->getOutputValue()->copyFrom(*inputAnchor); + dataLayers[1]->getOutputValue()->copyFrom(*inputLoc); + dataLayers[2]->getOutputValue()->copyFrom(*inputConf); + + // test layer initialize + bool store_FLAGS_use_gpu = FLAGS_use_gpu; + FLAGS_use_gpu = use_gpu; + std::vector parameters; + LayerPtr proposalLayer; + initTestLayer(configt, &layerMap, &parameters, &proposalLayer); + FLAGS_use_gpu = store_FLAGS_use_gpu; + proposalLayer->forward(PASS_GC); + 
checkMatrixEqual(proposalLayer->getOutputValue(), result); +} + +TEST(Layer, detectionOutputLayerFwd) { + bool useGpu = false; + // CPU case 1. + MatrixPtr inputLoc; + MatrixPtr inputConf; + MatrixPtr inputAnchor; + MatrixPtr result; + real nmsThreshold = 0.01; + real inputLocData[] = {0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1}; + real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6}; + real inputAnchorData[] = {1, 1, 6, 6, 1, 16, 16, 1, 3, 6, 8, 1, 16, 16, + 1, 2, 6, 7, 1, 16, 16, 2, 1, 7, 6, 1, 16, 16}; + real resultData[] = {0, + 1, + 0.68997448, + 3.18894059, + 3.18894059, + 5.01105940, + 5.01105940, + 0, + 1, + 0.64565631, + 3.18894059, + 5.18894059, + 5.01105940, + 7.01105940}; + inputLoc = Matrix::create(1, 16, false, useGpu); + inputConf = Matrix::create(1, 8, false, useGpu); + inputAnchor = Matrix::create(1, 28, false, useGpu); + result = Matrix::create(2, 7, false, useGpu); + inputLoc->setData(inputLocData); + inputConf->setData(inputConfData); + inputAnchor->setData(inputAnchorData); + result->setData(resultData); + doOneProposalTest(inputLoc, + inputConf, + inputAnchor, + /* feature_map_width */ 1, + /* feature_map_height */ 1, + nmsThreshold, + useGpu, + result); + +#ifndef PADDLE_ONLY_CPU + // GPU case 1. 
+ useGpu = true; + inputLoc = Matrix::create(1, 16, false, useGpu); + inputConf = Matrix::create(1, 8, false, useGpu); + inputAnchor = Matrix::create(1, 28, false, useGpu); + inputLoc->copyFrom(inputLocData, 16); + inputConf->copyFrom(inputConfData, 8); + inputAnchor->copyFrom(inputAnchorData, 28); + + nmsThreshold = 0.01; + result = Matrix::create(2, 7, false, useGpu); + result->copyFrom(resultData, 14); + doOneProposalTest(inputLoc, + inputConf, + inputAnchor, + /* feature_map_width */ 1, + /* feature_map_height */ 1, + nmsThreshold, + useGpu, + result); +#endif +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 37cd16c798907..9e934e022a734 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -289,6 +289,35 @@ message DetectionOutputConfig { optional uint32 width = 9 [default = 1]; } +message AnchorConfig { + required uint32 base_size = 1; + repeated float aspect_ratio = 2; + repeated float scale_ratio = 3; + optional uint32 feat_stride_x = 4 [default = 0]; + optional uint32 feat_stride_y = 5 [default = 0]; +} + +message RPNLossConfig { + required float pos_overlap_threshold = 1; + required float neg_overlap_threshold = 2; + required uint32 rpn_batch_size = 3; + required float rpn_fg_ratio = 4; + required float loss_ratio = 5; + optional uint32 height = 6 [default = 1]; + optional uint32 width = 7 [default = 1]; +} + +message ProposalConfig { + required float nms_threshold = 1; + required float confidence_threshold = 2; + required uint32 nms_top_k = 3; + required uint32 keep_top_k = 4; + required float min_width = 5; + required float min_height = 6; + optional uint32 height = 7 [default = 1]; + optional uint32 width = 8 [default = 1]; +} + message LayerInputConfig { required string input_layer_name = 1; optional string input_parameter_name = 2; @@ -309,6 +338,9 @@ message LayerInputConfig { optional 
RowConvConfig row_conv_conf = 15; optional MultiBoxLossConfig multibox_loss_conf = 16; optional DetectionOutputConfig detection_output_conf = 17; + optional AnchorConfig anchor_conf = 18; + optional RPNLossConfig rpn_loss_conf = 19; + optional ProposalConfig proposal_conf = 20; } message LayerConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 826ba2834a820..2476aefcee0d7 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1724,6 +1724,63 @@ def __init__(self, name, inputs, size, input_num, num_classes, self.config.size = size +@config_layer('anchor') +class AnchorLayer(LayerBase): + def __init__(self, + name, + inputs, + base_size, + aspect_ratio, + scale_ratio, + feat_stride_x=None, + feat_stride_y=None): + super(AnchorLayer, self).__init__(name, 'anchor', 0, inputs) + config_assert(len(inputs) == 2, 'AnchorLayer must have 2 inputs') + self.config.inputs[0].anchor_conf.base_size = base_size + self.config.inputs[0].anchor_conf.aspect_ratio.extend(aspect_ratio) + self.config.inputs[0].anchor_conf.scale_ratio.extend(scale_ratio) + self.config.inputs[0].anchor_conf.feat_stride_x = feat_stride_x or 0 + self.config.inputs[0].anchor_conf.feat_stride_y = feat_stride_y or 0 + + +@config_layer('rpn_loss') +class RPNLossLayer(LayerBase): + def __init__(self, name, inputs, pos_overlap_threshold, + neg_overlap_threshold, rpn_batch_size, rpn_fg_ratio, + loss_ratio): + super(RPNLossLayer, self).__init__(name, 'rpn_loss', 0, inputs) + config_assert(len(inputs) == 4, 'RPNLossLayer must have 4 inputs') + self.config.inputs[ + 0].rpn_loss_conf.pos_overlap_threshold = pos_overlap_threshold + self.config.inputs[ + 0].rpn_loss_conf.neg_overlap_threshold = neg_overlap_threshold + self.config.inputs[0].rpn_loss_conf.rpn_batch_size = rpn_batch_size + self.config.inputs[0].rpn_loss_conf.rpn_fg_ratio = rpn_fg_ratio + self.config.inputs[0].rpn_loss_conf.loss_ratio = loss_ratio + + 
+@config_layer('proposal') +class ProposalLayer(LayerBase): + def __init__(self, + name, + inputs, + nms_threshold, + confidence_threshold, + nms_top_k, + keep_top_k, + min_width=0, + min_height=0): + super(ProposalLayer, self).__init__(name, 'proposal', 0, inputs) + config_assert(len(inputs) == 3, 'ProposalLayer must have 3 inputs') + self.config.inputs[0].proposal_conf.nms_threshold = nms_threshold + self.config.inputs[ + 0].proposal_conf.confidence_threshold = confidence_threshold + self.config.inputs[0].proposal_conf.nms_top_k = nms_top_k + self.config.inputs[0].proposal_conf.keep_top_k = keep_top_k + self.config.inputs[0].proposal_conf.min_width = min_width + self.config.inputs[0].proposal_conf.min_height = min_height + + @config_layer('data') class DataLayer(LayerBase): def __init__(self, name, size, height=None, width=None, device=None): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index b0524a507bace..3fe86900ccaca 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -117,6 +117,9 @@ 'cross_channel_norm_layer', 'multibox_loss_layer', 'detection_output_layer', + 'anchor_layer', + 'rpn_loss_layer', + 'proposal_layer', 'spp_layer', 'pad_layer', 'eos_layer', @@ -199,6 +202,9 @@ class LayerType(object): PRIORBOX_LAYER = 'priorbox' MULTIBOX_LOSS_LAYER = 'multibox_loss' DETECTION_OUTPUT_LAYER = 'detection_output' + ANCHOR_LAYER = 'anchor' + RPN_LOSS_LAYER = 'rpn_loss' + PROPOSAL_LAYER = 'proposal' CTC_LAYER = 'ctc' WARP_CTC_LAYER = 'warp_ctc' @@ -1197,6 +1203,153 @@ def detection_output_layer(input_loc, name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size) +@wrap_name_default("anchor") +def anchor_layer(input, + image, + base_size, + aspect_ratio, + scale_ratio, + feat_stride_x=None, + feat_stride_y=None, + name=None): + """ + Generate the default anchor boxes. 
This layer is necessary for the + Region Proposal Networks of Faster R-CNN. + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput + :param image: The network input image. + :type image: LayerOutput + :param base_size: The basic anchor size. + :type base_size: int + :param aspect_ratio: The aspect ratio used to generate anchors. + :type aspect_ratio: list + :param scale_ratio: The scales used to generate anchors. + :type scale_ratio: list + :param feat_stride_x: The spatial scale between the image and feature map. + :type feat_stride_x: int + :param feat_stride_y: The spatial scale between the image and feature map. + :type feat_stride_y: int + :return: LayerOutput + """ + Layer( + name=name, + type=LayerType.ANCHOR_LAYER, + inputs=[input.name, image.name], + base_size=base_size, + aspect_ratio=aspect_ratio, + scale_ratio=scale_ratio, + feat_stride_x=feat_stride_x, + feat_stride_y=feat_stride_y) + return LayerOutput(name, LayerType.ANCHOR_LAYER, parents=[input, image]) + + +@wrap_name_default("rpn_loss") +def rpn_loss_layer(input_loc, + input_conf, + anchors, + label, + pos_overlap_threshold=0.7, + neg_overlap_threshold=0.3, + rpn_batch_size=256, + rpn_fg_ratio=0.5, + loss_ratio=10, + name=None): + """ + Compute the location loss and the confidence loss for the + Region Proposal Networks of Faster R-CNN. + + :param name: The Layer Name. + :type name: basestring + :param input_loc: The input predict locations. + :type input_loc: LayerOutput + :param input_conf: The input anchor box confidence. + :type input_conf: LayerOutput + :param anchors: The input anchor boxes location. + :type anchors: LayerOutput + :param label: The input label. + :type label: LayerOutput + :param pos_overlap_threshold: The threshold of the overlap for foreground. + :type pos_overlap_threshold: float + :param neg_overlap_threshold: The threshold of the overlap for background. 
+ :type neg_overlap_threshold: float + :param rpn_batch_size: The size of bbox batch for RPN training. + :type rpn_batch_size: int + :param rpn_fg_ratio: The ratio of the positive bbox in bbox batch + :type rpn_fg_ratio: float + :param loss_ratio: The ratio of location loss to confidence loss. + :type loss_ratio: float + :return: LayerOutput + """ + Layer( + name=name, + type=LayerType.RPN_LOSS_LAYER, + inputs=[anchors.name, label.name, input_loc.name, input_conf.name], + pos_overlap_threshold=pos_overlap_threshold, + neg_overlap_threshold=neg_overlap_threshold, + rpn_batch_size=rpn_batch_size, + rpn_fg_ratio=rpn_fg_ratio, + loss_ratio=loss_ratio) + return LayerOutput( + name, + LayerType.RPN_LOSS_LAYER, + parents=[anchors, label, input_loc, input_conf]) + + +@wrap_name_default("proposal") +def proposal_layer(input_loc, + input_conf, + anchors, + nms_threshold=0.45, + confidence_threshold=0.01, + nms_top_k=400, + keep_top_k=200, + min_width=16, + min_height=16, + name=None): + """ + Apply the NMS to the output of RPN and compute the proposal location. + + :param name: The Layer Name. + :type name: basestring + :param input_loc: The input predict locations. + :type input_loc: LayerOutput. + :param input_conf: The input anchor box confidence. + :type input_conf: LayerOutput. + :param anchors: The input anchor boxes location. + :type anchors: LayerOutput + :param nms_threshold: The Non-maximum suppression threshold. + :type nms_threshold: float + :param confidence_threshold: The classification confidence threshold + :type confidence_threshold: float + :param nms_top_k: The bbox number kept of the NMS's output + :type nms_top_k: int + :param keep_top_k: The bbox number kept of the layer's output + :type keep_top_k: int + :param min_width: The proposal width threshold. + :type min_width: float + :param min_height: The proposal height threshold. 
+ :type min_height: float + :return: LayerOutput + """ + Layer( + name=name, + type=LayerType.PROPOSAL_LAYER, + inputs=[anchors.name, input_loc.name, input_conf.name], + nms_threshold=nms_threshold, + confidence_threshold=confidence_threshold, + nms_top_k=nms_top_k, + keep_top_k=keep_top_k, + min_width=min_width, + min_height=min_height) + return LayerOutput( + name, + LayerType.PROPOSAL_LAYER, + parents=[anchors, input_loc, input_conf]) + + @wrap_name_default("cross_channel_norm") def cross_channel_norm_layer(input, name=None, param_attr=None): """