From 0ab22f888a804183505581c5dbbd9458cc6146e6 Mon Sep 17 00:00:00 2001
From: guosheng <guosheng@baidu.com>
Date: Thu, 6 Jul 2017 16:55:06 +0800
Subject: [PATCH] add Region Proposal Network for Faster R-CNN, including
 AnchorLayer, RPNLossLayer and ProposalLayer

---
 paddle/gserver/layers/AnchorLayer.cpp         | 157 +++++
 paddle/gserver/layers/AnchorLayer.h           |  62 ++
 paddle/gserver/layers/ProposalLayer.cpp       | 353 +++++++++++
 paddle/gserver/layers/ProposalLayer.h         | 138 ++++
 paddle/gserver/layers/RPNLossLayer.cpp        | 589 ++++++++++++++++++
 paddle/gserver/layers/RPNLossLayer.h          | 142 +++++
 paddle/gserver/tests/CMakeLists.txt           |   7 +
 paddle/gserver/tests/test_LayerGrad.cpp       |  69 ++
 paddle/gserver/tests/test_Proposal.cpp        | 163 +++++
 proto/ModelConfig.proto                       |  32 +
 python/paddle/trainer/config_parser.py        |  57 ++
 .../paddle/trainer_config_helpers/layers.py   | 153 +++++
 12 files changed, 1922 insertions(+)
 create mode 100644 paddle/gserver/layers/AnchorLayer.cpp
 create mode 100644 paddle/gserver/layers/AnchorLayer.h
 create mode 100644 paddle/gserver/layers/ProposalLayer.cpp
 create mode 100644 paddle/gserver/layers/ProposalLayer.h
 create mode 100644 paddle/gserver/layers/RPNLossLayer.cpp
 create mode 100644 paddle/gserver/layers/RPNLossLayer.h
 create mode 100644 paddle/gserver/tests/test_Proposal.cpp
diff --git a/paddle/gserver/layers/AnchorLayer.cpp b/paddle/gserver/layers/AnchorLayer.cpp
new file mode 100644
index 0000000000000..78b9245a5ffc4
--- /dev/null
+++ b/paddle/gserver/layers/AnchorLayer.cpp
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "AnchorLayer.h"
+#include <vector>
+
+namespace paddle {
+
+REGISTER_LAYER(anchor, AnchorLayer);
+
+// Reads the anchor settings of input 0 into the layer's members.
+bool AnchorLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  const AnchorConfig& conf = config_.inputs(0).anchor_conf();
+  const auto& scales = conf.scale_ratio();
+  const auto& ratios = conf.aspect_ratio();
+  // Append the configured scales and aspect ratios to the member lists.
+  anchorScales_.insert(anchorScales_.end(), scales.begin(), scales.end());
+  anchorRatios_.insert(anchorRatios_.end(), ratios.begin(), ratios.end());
+  allowedBorder_ = 0;
+  baseSize_ = conf.base_size();
+  featStrideX_ = conf.feat_stride_x();
+  featStrideY_ = conf.feat_stride_y();
+  return true;
+}
+
+// Expands the base box over every aspect ratio, then every scale.
+void AnchorLayer::generateBaseAnchors() {
+  const real last = static_cast<real>(baseSize_ - 1);
+  const std::vector<real> baseBox{0, 0, last, last};
+  for (const std::vector<real>& ratioBox : enumRatio(baseBox)) {
+    const std::vector<std::vector<real>> scaled = enumScale(ratioBox);
+    anchors_.insert(anchors_.end(), scaled.begin(), scaled.end());
+  }
+}
+
+// Returns one box per aspect ratio, all sharing the input box's centre.
+std::vector<std::vector<real>> AnchorLayer::enumRatio(
+    const std::vector<real>& anchor) {
+  const std::vector<real> box = anchor2whctr(anchor);  // w, h, ctrX, ctrY
+  const real area = box[0] * box[1];
+  std::vector<std::vector<real>> result;
+  for (const real ratio : anchorRatios_) {
+    // Width is rounded first; height is derived from the rounded width.
+    const real scaledArea = area / ratio;
+    const real width = std::round(std::sqrt(scaledArea));
+    const real height = std::round(width * ratio);
+    result.push_back(whctr2anchor(width, height, box[2], box[3]));
+  }
+  return result;
+}
+
+// Returns one box per scale: width and height multiplied, centre unchanged.
+std::vector<std::vector<real>> AnchorLayer::enumScale(
+    const std::vector<real>& anchor) {
+  const std::vector<real> box = anchor2whctr(anchor);  // w, h, ctrX, ctrY
+  const real centerX = box[2];
+  const real centerY = box[3];
+
+  std::vector<std::vector<real>> result;
+  for (const real scale : anchorScales_) {
+    const real width = box[0] * scale;
+    const real height = box[1] * scale;
+    result.push_back(whctr2anchor(width, height, centerX, centerY));
+  }
+  return result;
+}
+
+// Converts (xmin, ymin, xmax, ymax) to (w, h, ctrX, ctrY), inclusive coords.
+std::vector<real> AnchorLayer::anchor2whctr(const std::vector<real>& anchor) {
+  const real width = anchor[2] - anchor[0] + 1;
+  const real height = anchor[3] - anchor[1] + 1;
+  const real ctrX = (anchor[2] + anchor[0]) / 2;
+  const real ctrY = (anchor[3] + anchor[1]) / 2;
+  return std::vector<real>{width, height, ctrX, ctrY};
+}
+
+// Converts (w, h, ctrX, ctrY) back to corner form (xmin, ymin, xmax, ymax);
+// the inverse of anchor2whctr for inclusive coordinates.
+std::vector<real> AnchorLayer::whctr2anchor(real w,
+                                            real h,
+                                            real ctrX,
+                                            real ctrY) {
+  return std::vector<real>{static_cast<real>(ctrX - 0.5 * (w - 1)),
+                           static_cast<real>(ctrY - 0.5 * (h - 1)),
+                           static_cast<real>(ctrX + 0.5 * (w - 1)),
+                           static_cast<real>(ctrY + 0.5 * (h - 1))};
+}
+
+// Writes 7 values per anchor (xmin, ymin, xmax, ymax, inside-flag, image
+// width, image height) for every feature-map cell, rows outermost.
+void AnchorLayer::generateAllAnchors(size_t layerHeight,
+                                     size_t layerWidth,
+                                     size_t imageHeight,
+                                     size_t imageWidth) {
+  auto* tmpPtr = getOutputValue()->getData();
+  if (featStrideX_ == 0)
+    featStrideX_ = static_cast<real>(imageWidth) / layerWidth;
+  if (featStrideY_ == 0)
+    featStrideY_ = static_cast<real>(imageHeight) / layerHeight;
+  size_t idx = 0;
+  for (size_t h = 0; h < layerHeight; ++h) {
+    for (size_t w = 0; w < layerWidth; ++w) {
+      for (size_t i = 0; i < anchors_.size(); ++i) {
+        // Column w shifts x by the x stride; row h shifts y by the y stride.
+        tmpPtr[idx++] = anchors_[i][0] + w * featStrideX_;
+        tmpPtr[idx++] = anchors_[i][1] + h * featStrideY_;
+        tmpPtr[idx++] = anchors_[i][2] + w * featStrideX_;
+        tmpPtr[idx++] = anchors_[i][3] + h * featStrideY_;
+        if (tmpPtr[idx - 4] + allowedBorder_ >= 0 &&
+            tmpPtr[idx - 3] + allowedBorder_ >= 0 &&
+            tmpPtr[idx - 2] < imageWidth + allowedBorder_ &&
+            tmpPtr[idx - 1] < imageHeight + allowedBorder_) {
+          tmpPtr[idx++] = 1;  // anchor lies inside the image
+        } else {
+          tmpPtr[idx++] = -1;
+        }
+        tmpPtr[idx++] = imageWidth;
+        tmpPtr[idx++] = imageHeight;  // used for box clipping in proposals
+      }
+    }
+  }
+}
+
+// Emits anchors for the feature map (input 0) within the image (input 1).
+void AnchorLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  auto featMap = getInput(0);
+  size_t layerWidth = featMap.getFrameWidth();
+  size_t layerHeight = featMap.getFrameHeight();
+  auto image = getInput(1);
+  size_t imageWidth = image.getFrameWidth();
+  size_t imageHeight = image.getFrameHeight();
+
+  // generateAllAnchors() writes 7 values per anchor, so reserve 7 (not 5).
+  int dim = layerHeight * layerWidth * anchorScales_.size() *
+            anchorRatios_.size() * 7;
+  reserveOutput(1, dim);
+  // Base anchors are appended to anchors_; build them only once.
+  if (anchors_.empty()) generateBaseAnchors();
+  generateAllAnchors(layerHeight, layerWidth, imageHeight, imageWidth);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/AnchorLayer.h b/paddle/gserver/layers/AnchorLayer.h
new file mode 100644
index 0000000000000..86b786793fc99
--- /dev/null
+++ b/paddle/gserver/layers/AnchorLayer.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+/**
+ * @brief A layer used by Faster R-CNN to generate anchor-box locations.
+ * - Input: Exactly two input layers are accepted: a convolution output
+ *          layer (the feature map) followed by a data layer (the image).
+ * - Output: The anchor-box locations of the input data.
+ * Reference:
+ *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ *    Networks
+ */
+
+class AnchorLayer : public Layer {
+public:
+  explicit AnchorLayer(const LayerConfig& config) : Layer(config) {}
+  ~AnchorLayer() {}
+  // Loads scales, aspect ratios, base size and strides from the config.
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  // Writes the anchor boxes for the current inputs to the output; no backward.
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override {}
+
+protected:
+  size_t baseSize_;  // base box spans 0 .. baseSize_ - 1 on both axes
+  size_t featStrideX_;  // x stride; 0 => imageWidth / layerWidth (truncated)
+  size_t featStrideY_;  // y stride; 0 => imageHeight / layerHeight (truncated)
+  size_t allowedBorder_;  // slack in the inside-image test; set to 0 in init
+  std::vector<real> anchorScales_;  // from config scale_ratio
+  std::vector<real> anchorRatios_;  // from config aspect_ratio
+  std::vector<std::vector<real>> anchors_;  // base boxes: xmin,ymin,xmax,ymax
+
+  void generateBaseAnchors();
+  std::vector<std::vector<real>> enumRatio(const std::vector<real>& anchor);
+  std::vector<std::vector<real>> enumScale(const std::vector<real>& anchor);
+  std::vector<real> anchor2whctr(const std::vector<real>& anchor);
+  std::vector<real> whctr2anchor(real w, real h, real ctrX, real ctrY);
+  void generateAllAnchors(size_t layerHeight,
+                          size_t layerWidth,
+                          size_t imageHeight,
+                          size_t imageWidth);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ProposalLayer.cpp b/paddle/gserver/layers/ProposalLayer.cpp
new file mode 100644
index 0000000000000..555343e330dbb
--- /dev/null
+++ b/paddle/gserver/layers/ProposalLayer.cpp
@@ -0,0 +1,353 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ProposalLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(proposal, ProposalLayer);
+
+bool ProposalLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  inputNum_ = 1;      // one location input and one confidence input
+  numClasses_ = 2;    // background plus one foreground class
+  backgroundId_ = 0;  // class 0 is skipped when collecting detections
+  const auto& conf = config_.inputs(0).proposal_conf();
+  confidenceThreshold_ = conf.confidence_threshold();
+  nmsThreshold_ = conf.nms_threshold();
+  nmsTopK_ = conf.nms_top_k();
+  keepTopK_ = conf.keep_top_k();
+  minWidth_ = conf.min_width();
+  minHeight_ = conf.min_height();
+  return true;
+}
+
+// Intersection-over-union of two boxes; returns 0 when they are disjoint.
+real ProposalLayer::jaccardOverlap(const UnnormalizedBBox& bbox1,
+                                   const UnnormalizedBBox& bbox2) {
+  const bool disjoint = bbox2.xMin > bbox1.xMax || bbox2.xMax < bbox1.xMin ||
+                        bbox2.yMin > bbox1.yMax || bbox2.yMax < bbox1.yMin;
+  if (disjoint) return 0.0;
+
+  // Corners of the intersection rectangle.
+  const real left = std::max(bbox1.xMin, bbox2.xMin);
+  const real top = std::max(bbox1.yMin, bbox2.yMin);
+  const real right = std::min(bbox1.xMax, bbox2.xMax);
+  const real bottom = std::min(bbox1.yMax, bbox2.yMax);
+
+  // The +1 counts both edge coordinates as part of the box.
+  const real overlapW = right - left + 1;
+  const real overlapH = bottom - top + 1;
+  const real overlapArea = overlapW * overlapH;
+  const real area1 = bbox1.getArea();
+  const real area2 = bbox2.getArea();
+  return overlapArea / (area1 + area2 - overlapArea);
+}
+
+// Greedy non-maximum suppression for one class. Boxes smaller than
+// minWidth/minHeight or scoring at most confThreshold are dropped, the
+// rest are sorted with sortScorePairDescend and cut to topK (if topK > 0);
+// a box index is appended to *indices unless it overlaps a kept box too much.
+void ProposalLayer::applyNMSFast(const vector<UnnormalizedBBox>& bboxes,
+                                 const real* confScoreData,
+                                 size_t classIdx,
+                                 size_t topK,
+                                 real confThreshold,
+                                 real nmsThreshold,
+                                 real minWidth,
+                                 real minHeight,
+                                 size_t numPriorBBoxes,
+                                 size_t numClasses,
+                                 vector<size_t>* indices) {
+  vector<pair<real, size_t>> scores;
+  for (size_t i = 0; i < numPriorBBoxes; ++i) {
+    if (bboxes[i].getWidth() < minWidth || bboxes[i].getHeight() < minHeight) {
+      continue;  // box is narrower or shorter than the minimum
+    }
+    size_t confOffset = i * numClasses + classIdx;
+    if (confScoreData[confOffset] > confThreshold)
+      scores.push_back(std::make_pair(confScoreData[confOffset], i));
+  }
+  // stable_sort keeps equal-scored candidates in index order.
+  std::stable_sort(scores.begin(), scores.end(), sortScorePairDescend<size_t>);
+  if (topK > 0 && topK < scores.size()) scores.resize(topK);
+  // Walk the sorted candidates in place; erasing the front element on every
+  // iteration would make this pass quadratic in the number of candidates.
+  for (const pair<real, size_t>& candidate : scores) {
+    const size_t idx = candidate.second;
+    bool keep = true;
+    for (size_t i = 0; keep && i < indices->size(); ++i) {
+      const real overlap = jaccardOverlap(bboxes[idx], bboxes[(*indices)[i]]);
+      keep = overlap <= nmsThreshold;
+    }
+    if (keep) indices->push_back(idx);
+  }
+}
+
+size_t ProposalLayer::getDetectionIndices(  // returns total kept boxes
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const size_t confThreshold,  // NOTE(review): size_t truncates a real value
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const real minWidth,
+    const real minHeight,
+    const vector<vector<UnnormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices) {
+  size_t totalKeepNum = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    const vector<UnnormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
+    size_t numDetected = 0;
+    map<size_t, vector<size_t>> indices;
+    size_t confOffset = n * numPriorBBoxes * numClasses;
+    for (size_t c = 0; c < numClasses; ++c) {
+      if (c == backgroundId) continue;  // background yields no detections
+      applyNMSFast(decodedBBoxes,
+                   confData + confOffset,
+                   c,
+                   nmsTopK,
+                   confThreshold,
+                   nmsThreshold,
+                   minWidth,
+                   minHeight,
+                   numPriorBBoxes,
+                   numClasses,
+                   &(indices[c]));
+      numDetected += indices[c].size();
+    }
+    if (keepTopK > 0 && numDetected > keepTopK) {  // trim to the best keepTopK
+      vector<pair<real, pair<size_t, size_t>>> scoreIndexPairs;
+      for (size_t c = 0; c < numClasses; ++c) {
+        const vector<size_t>& labelIndices = indices[c];
+        for (size_t i = 0; i < labelIndices.size(); ++i) {
+          size_t idx = labelIndices[i];
+          scoreIndexPairs.push_back(
+              std::make_pair((confData + confOffset)[idx * numClasses + c],
+                             std::make_pair(c, idx)));
+        }
+      }
+      std::sort(scoreIndexPairs.begin(),
+                scoreIndexPairs.end(),
+                sortScorePairDescend<pair<size_t, size_t>>);
+      scoreIndexPairs.resize(keepTopK);
+      map<size_t, vector<size_t>> newIndices;  // regrouped by class label
+      for (size_t i = 0; i < scoreIndexPairs.size(); ++i) {
+        size_t label = scoreIndexPairs[i].second.first;
+        size_t idx = scoreIndexPairs[i].second.second;
+        newIndices[label].push_back(idx);
+      }
+      allDetectionIndices->push_back(newIndices);
+      totalKeepNum += keepTopK;
+    } else {
+      allDetectionIndices->push_back(indices);
+      totalKeepNum += numDetected;
+    }
+  }
+  return totalKeepNum;
+}
+
+// Packs every kept detection into a row of 7 values (image index, label,
+// score, xMin, yMin, xMax, yMax) and copies the rows into out.
+void ProposalLayer::getDetectionOutput(
+    const real* confData,
+    const size_t numKept,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t batchSize,
+    const vector<map<size_t, vector<size_t>>>& allIndices,
+    const vector<vector<UnnormalizedBBox>>& allDecodedBBoxes,
+    Matrix& out) {
+  MatrixPtr rows;
+  Matrix::resizeOrCreate(rows, numKept, 7, false, false);
+  real* const rowData = rows->getData();
+  real* cursor = rowData;
+
+  for (size_t n = 0; n < batchSize; ++n) {
+    // Scores of image n start at this offset in confData.
+    const real* imageConf = confData + n * numPriorBBoxes * numClasses;
+    for (const auto& entry : allIndices[n]) {
+      const size_t label = entry.first;
+      const vector<UnnormalizedBBox>& boxes = allDecodedBBoxes[n];
+      for (const size_t idx : entry.second) {
+        const UnnormalizedBBox& box = boxes[idx];
+        *cursor++ = n;
+        *cursor++ = label;
+        *cursor++ = imageConf[idx * numClasses + label];
+        *cursor++ = box.xMin;
+        *cursor++ = box.yMin;
+        *cursor++ = box.xMax;
+        *cursor++ = box.yMax;
+      }
+    }
+  }
+  out.copyFrom(rowData, numKept * 7);
+}
+
+// Applies predicted offsets (dx, dy, dw, dh) to an anchor and clips the
+// resulting box to the image whose size is stored in anchorBoxData[5..6].
+void ProposalLayer::decodeTarget(const std::vector<real>& anchorBoxData,
+                                 const std::vector<real>& locPredData,
+                                 UnnormalizedBBox& predBox) {
+  real anchorBoxWidth = anchorBoxData[2] - anchorBoxData[0] + 1;
+  real anchorBoxHeight = anchorBoxData[3] - anchorBoxData[1] + 1;
+  real anchorBoxCenterX = (anchorBoxData[2] + anchorBoxData[0]) / 2;
+  real anchorBoxCenterY = (anchorBoxData[3] + anchorBoxData[1]) / 2;
+  real dx = locPredData[0];
+  real dy = locPredData[1];
+  real dw = locPredData[2];
+  real dh = locPredData[3];
+  real predCtrX = dx * anchorBoxWidth + anchorBoxCenterX;
+  real predCtrY = dy * anchorBoxHeight + anchorBoxCenterY;
+  // dw and dh are log-scale factors: size = exp(delta) * anchor size.
+  real predWidth = std::exp(dw) * anchorBoxWidth;
+  real predHeight = std::exp(dh) * anchorBoxHeight;
+  // clip predicted box to image
+  real xMin = static_cast<real>(0.);
+  real yMin = static_cast<real>(0.);
+  real xMax = anchorBoxData[5] - 1;
+  real yMax = anchorBoxData[6] - 1;
+  predBox.xMin = std::min(
+      std::max(static_cast<real>(predCtrX - 0.5 * predWidth), xMin), xMax);
+  predBox.yMin = std::min(
+      std::max(static_cast<real>(predCtrY - 0.5 * predHeight), yMin), yMax);
+  predBox.xMax = std::min(
+      std::max(static_cast<real>(predCtrX + 0.5 * predWidth), xMin), xMax);
+  predBox.yMax = std::min(
+      std::max(static_cast<real>(predCtrY + 0.5 * predHeight), yMin), yMax);
+}
+
+void ProposalLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+  // Total element counts of the location and confidence inputs.
+  locSizeSum_ = 0;
+  confSizeSum_ = 0;
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    locSizeSum_ += inLoc->getElementCnt();
+    confSizeSum_ += inConf->getElementCnt();
+  }
+  // Scratch buffers that receive the permuted (kNCHWToNHWC) inputs.
+  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
+
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto& layerConf = config_.inputs(0).proposal_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    // Fall back to the configured frame size when the input reports 0.
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+    locOffset += appendWithPermute(*inLoc,
+                                   height,
+                                   width,
+                                   locSizeSum_,
+                                   locOffset,
+                                   batchSize,
+                                   *locTmpBuffer_,
+                                   kNCHWToNHWC);
+    confOffset += appendWithPermute(*inConf,
+                                    height,
+                                    width,
+                                    confSizeSum_,
+                                    confOffset,
+                                    batchSize,
+                                    *confTmpBuffer_,
+                                    kNCHWToNHWC);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+  // On GPU, copy inputs into CPU buffers; the steps below read those copies.
+  MatrixPtr priorValue;
+  if (useGpu_) {
+    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
+    Matrix::resizeOrCreate(
+        confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false);
+    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
+    Matrix::resizeOrCreate(
+        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
+
+    locCpuBuffer_->copyFrom(*locTmpBuffer_);
+    confCpuBuffer_->copyFrom(*confTmpBuffer_);
+    priorCpuValue_->copyFrom(*priorTmpValue);
+
+    locBuffer_ = locCpuBuffer_;
+    confBuffer_ = confCpuBuffer_;
+    priorValue = priorCpuValue_;
+  } else {
+    priorValue = getInputValue(*getPriorBoxLayer());
+    locBuffer_ = locTmpBuffer_;
+    confBuffer_ = confTmpBuffer_;
+  }
+  confBuffer_->softmax(*confBuffer_);  // input and output are the same matrix
+  // Decode one box per anchor (7 values each) for every image in the batch.
+  size_t numPriors = priorValue->getElementCnt() / 7;
+  std::vector<std::vector<UnnormalizedBBox>> allDecodedBBoxes;
+  for (size_t n = 0; n < batchSize; ++n) {
+    std::vector<UnnormalizedBBox> decodedBBoxes;
+    for (size_t i = 0; i < numPriors; ++i) {
+      size_t priorOffset = i * 7;
+      std::vector<real> anchorBoxData;
+      for (size_t j = 0; j < 7; ++j)
+        anchorBoxData.push_back(*(priorValue->getData() + priorOffset + j));
+      size_t locPredOffset = n * numPriors * 4 + i * 4;
+      std::vector<real> locPredData;
+      for (size_t j = 0; j < 4; ++j)
+        locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
+      UnnormalizedBBox bbox;
+      decodeTarget(anchorBoxData, locPredData, bbox);
+      decodedBBoxes.push_back(bbox);
+    }
+    allDecodedBBoxes.push_back(decodedBBoxes);
+  }
+  // Per-image NMS; numKept is the total number of surviving boxes.
+  std::vector<std::map<size_t, std::vector<size_t>>> allIndices;
+  size_t numKept = getDetectionIndices(confBuffer_->getData(),
+                                       numPriors,
+                                       numClasses_,
+                                       backgroundId_,
+                                       batchSize,
+                                       confidenceThreshold_,  // real -> size_t
+                                       nmsTopK_,
+                                       nmsThreshold_,
+                                       keepTopK_,
+                                       minWidth_,
+                                       minHeight_,
+                                       allDecodedBBoxes,
+                                       &allIndices);
+  // One output row of 7 values per kept box.
+  resetOutput(numKept, 7);
+  MatrixPtr outV = getOutputValue();
+  getDetectionOutput(confBuffer_->getData(),
+                     numKept,
+                     numPriors,
+                     numClasses_,
+                     batchSize,
+                     allIndices,
+                     allDecodedBBoxes,
+                     *outV);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ProposalLayer.h b/paddle/gserver/layers/ProposalLayer.h
new file mode 100644
index 0000000000000..d41e888fcc56c
--- /dev/null
+++ b/paddle/gserver/layers/ProposalLayer.h
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <map>
+#include <vector>
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+using std::vector;
+using std::pair;
+using std::map;
+
+namespace paddle {
+
+/**
+ * The detection output layer that generates proposals in the RPN of Faster
+ * R-CNN. It applies non-maximum suppression to all predicted bounding boxes
+ * and keeps the top-K of them.
+ * - Input: This layer needs three input layers. The first is the anchor
+ *          layer; the other two are convolution layers that produce the
+ *          bbox location offsets and the classification confidences.
+ * - Output: The predicted bounding box locations.
+ * Reference:
+ *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ *    Networks
+ */
+
+class ProposalLayer : public Layer {
+public:
+  explicit ProposalLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override {}
+
+protected:
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+  // Inputs are ordered: anchors, inputNum_ loc layers, inputNum_ conf layers.
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[1 + index];
+  }
+
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[1 + inputNum_ + index];
+  }
+  // Box whose width and height count both edge coordinates (hence the +1).
+  struct UnnormalizedBBox : BBoxBase<real> {
+    UnnormalizedBBox() : BBoxBase<real>() {}
+    real getWidth() const { return xMax - xMin + 1; }
+    real getHeight() const { return yMax - yMin + 1; }
+  };
+  // Intersection-over-union of two boxes.
+  real jaccardOverlap(const UnnormalizedBBox& bbox1,
+                      const UnnormalizedBBox& bbox2);
+  // Greedy NMS for one class; appends kept box indices to *indices.
+  void applyNMSFast(const vector<UnnormalizedBBox>& bboxes,
+                    const real* confScoreData,
+                    size_t classIdx,
+                    size_t topK,
+                    real confThreshold,
+                    real nmsThreshold,
+                    real minWidth,
+                    real minHeight,
+                    size_t numPriorBBoxes,
+                    size_t numClasses,
+                    vector<size_t>* indices);
+  // Runs NMS per image; returns the total number of kept boxes.
+  size_t getDetectionIndices(
+      const real* confData,
+      const size_t numPriorBBoxes,
+      const size_t numClasses,
+      const size_t backgroundId,
+      const size_t batchSize,
+      const size_t confThreshold,  // NOTE(review): size_t truncates real
+      const size_t nmsTopK,
+      const real nmsThreshold,
+      const size_t keepTopK,
+      const real minWidth,
+      const real minHeight,
+      const vector<vector<UnnormalizedBBox>>& allDecodedBBoxes,
+      vector<map<size_t, vector<size_t>>>* allDetectionIndices);
+  // Writes one 7-value row per kept box into out.
+  void getDetectionOutput(
+      const real* confData,
+      const size_t numKept,
+      const size_t numPriorBBoxes,
+      const size_t numClasses,
+      const size_t batchSize,
+      const vector<map<size_t, vector<size_t>>>& allIndices,
+      const vector<vector<UnnormalizedBBox>>& allDecodedBBoxes,
+      Matrix& out);
+  // Decodes predicted offsets against an anchor into predBox.
+  void decodeTarget(const std::vector<real>& anchorBoxData,
+                    const std::vector<real>& locPredData,
+                    UnnormalizedBBox& predBox);
+
+private:
+  real nmsThreshold_;
+  real confidenceThreshold_;
+  size_t nmsTopK_;
+  size_t keepTopK_;
+  real minWidth_;
+  real minHeight_;
+
+  size_t numClasses_;
+  size_t inputNum_;
+  size_t backgroundId_;
+
+  size_t locSizeSum_;
+  size_t confSizeSum_;
+
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RPNLossLayer.cpp b/paddle/gserver/layers/RPNLossLayer.cpp
new file mode 100644
index 0000000000000..7ea4d80e84472
--- /dev/null
+++ b/paddle/gserver/layers/RPNLossLayer.cpp
@@ -0,0 +1,589 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RPNLossLayer.h"
+#include <float.h>
+#include <vector>
+#include "DataLayer.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+
+REGISTER_LAYER(rpn_loss, RPNLossLayer);
+
+// Loads RPN loss hyper-parameters; fails if base-layer initialization fails.
+bool RPNLossLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  // Bind by reference: plain `auto` would copy the whole protobuf message.
+  const auto& layerConf = config_.inputs(0).rpn_loss_conf();
+  posOverlapThreshold_ = layerConf.pos_overlap_threshold();
+  negOverlapThreshold_ = layerConf.neg_overlap_threshold();
+  rpnBatchSize_ = layerConf.rpn_batch_size();
+  rpnFgRatio_ = layerConf.rpn_fg_ratio();
+  lossRatio_ = layerConf.loss_ratio();
+  numClasses_ = 2;    // two-way softmax: foreground vs. background
+  inputNum_ = 1;      // one location input and one confidence input
+  backgroundId_ = 0;  // class index written for negative anchors in forward()
+  return true;
+}
+
+// Computes the IoU of every in-image anchor with every ground-truth box into
+// `overlaps` (row-major: anchor index * gtBBoxes.size() + gtBBox index).
+// Rows of anchors flagged out-of-image (element [4] == -1) are left untouched
+// so the caller's sentinel survives; all other entries receive a value >= 0.
+void RPNLossLayer::bboxOverlaps(
+    const std::vector<std::vector<real>>& anchorBoxes,
+    const std::vector<std::vector<real>>& gtBBoxes,
+    std::vector<real>& overlaps) {
+  const size_t numGT = gtBBoxes.size();
+  for (size_t i = 0; i < anchorBoxes.size(); ++i) {
+    const std::vector<real>& a = anchorBoxes[i];
+    if (a[4] == -1) continue;  // out of the image, keep only inside anchors
+    for (size_t j = 0; j < numGT; ++j) {
+      const std::vector<real>& g = gtBBoxes[j];
+      const real w = std::min(a[2], g[2]) - std::max(a[0], g[0]) + 1;
+      const real h = std::min(a[3], g[3]) - std::max(a[1], g[1]) + 1;
+      // Disjoint boxes get IoU 0: leaving the caller's -1 here would make
+      // labelAnchors() disable the anchor instead of labelling it background.
+      const real inter = (w > 0 && h > 0) ? w * h : 0;
+      const real unionArea = (g[2] - g[0] + 1) * (g[3] - g[1] + 1) +
+                             (a[2] - a[0] + 1) * (a[3] - a[1] + 1) - inter;
+      overlaps[i * numGT + j] = inter > 0 ? inter / unionArea : 0;
+    }
+  }
+}
+
+// Labels every anchor of one image for RPN training: 1 = foreground,
+// 0 = background, -1 = ignored. `overlaps` is the row-major
+// (anchor x gtBBox) table filled by bboxOverlaps(); `matchIndices` and
+// `labels` must arrive pre-filled with -1 and sized to the anchor count.
+// On return matchIndices[a] is the gtBBox with the highest overlap for
+// anchor a. Returns (number of foreground, number of background) anchors.
+// NOTE(review): thresholding runs after the per-gtBBox pass, so the best
+// anchor of a gtBBox is relabelled background when its overlap is at or
+// below negOverlapThreshold - confirm this ordering is intended.
+std::pair<size_t, size_t> RPNLossLayer::labelAnchors(
+    const std::vector<std::vector<real>>& anchorBoxes,
+    const std::vector<std::vector<real>>& gtBBoxes,
+    const std::vector<real>& overlaps,
+    const real posOverlapThreshold,
+    const real negOverlapThreshold,
+    std::vector<int>& matchIndices,
+    std::vector<int>& labels) {
+  const size_t numGT = gtBBoxes.size();
+  std::vector<int> gtBest(numGT, -1);  // overlaps.argmax(axis=0)
+  for (size_t n = 0; n < overlaps.size(); ++n) {
+    const size_t anchorIdx = n / numGT;
+    const size_t gtIdx = n % numGT;
+    int& match = matchIndices[anchorIdx];  // overlaps.argmax(axis=1)
+    if (match == -1 || overlaps[n] > overlaps[anchorIdx * numGT + match]) {
+      match = gtIdx;
+    }
+    int& best = gtBest[gtIdx];
+    if (best == -1 || overlaps[n] > overlaps[best * numGT + gtIdx]) {
+      best = anchorIdx;
+    }
+  }
+  // fg label: anchor with highest overlap for each gtBBox. The >= 0 test
+  // covers an empty overlap table, where no best anchor was ever recorded.
+  for (size_t g = 0; g < numGT; ++g) {
+    const int a = gtBest[g];
+    if (a >= 0 && overlaps[a * numGT + g] > 0) labels[a] = 1;
+  }
+  // fg/bg/disabled label: above/below threshold IOU of the best match.
+  for (size_t n = 0; n < anchorBoxes.size(); ++n) {
+    if (matchIndices[n] < 0) continue;  // anchor absent from `overlaps`
+    const real iou = overlaps[n * numGT + matchIndices[n]];
+    if (iou >= posOverlapThreshold) {
+      labels[n] = 1;
+    } else if (iou <= negOverlapThreshold) {
+      labels[n] = iou < 0 ? -1 : 0;  // negative overlap: out of the image
+    }
+  }
+  size_t numPos = 0;
+  size_t numNeg = 0;
+  for (size_t n = 0; n < labels.size(); ++n) {
+    if (labels[n] == 1) ++numPos;
+    if (labels[n] == 0) ++numNeg;
+  }
+  return std::make_pair(numPos, numNeg);
+}
+
+// Randomly keeps m of the entries equal to `label` and overwrites the others
+// with `disabledLable`. Each candidate is kept with probability m/n over what
+// remains, so n is presumably the number of entries equal to `label`.
+template <typename T>
+void RPNLossLayer::sampleAnchors(
+    std::vector<T>& allLabels, T label, T disabledLable, size_t m, size_t n) {
+  auto& engine = ThreadLocalRandomEngine::get();
+  for (T& current : allLabels) {
+    if (current != label) continue;
+    const bool keep = rand_(engine) * n < m;  // one draw per candidate
+    if (!keep) current = disabledLable;
+    m -= keep ? 1 : 0;
+    --n;
+  }
+}
+
+// Matches anchors to ground-truth boxes for the whole mini-batch and samples
+// the anchors that enter the loss.
+//   priorValue     7 reals per anchor; the first 5 are read here:
+//                  xmin, ymin, xmax, ymax, out-of-image flag
+//   gtValue        4 reals per ground-truth box
+//   gtStartPosPtr  image n (n < seqNum) owns the boxes
+//                  [gtStartPosPtr[n], gtStartPosPtr[n + 1])
+//   boxBatchSize   anchor quota for the mini-batch; boxFgRatio is the share
+//                  of it reserved for foreground anchors
+// Exactly one entry per image is appended to each output vector:
+//   matchIndicesVecPtr  per anchor, the matched gtBBox index, or -1 unless the
+//                       anchor was kept as foreground
+//   negIndicesVecPtr    indices of the anchors kept as background
+// Returns the (foreground, background) counts actually kept, which are below
+// the quotas whenever too few candidates exist.
+// NOTE(review): the background quota is boxBatchSize minus the foreground
+// quota, not minus the foreground anchors really found - confirm.
+pair<size_t, size_t> RPNLossLayer::generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const size_t batchSize,
+    const real posOverlapThreshold,
+    const real negOverlapThreshold,
+    const size_t boxBatchSize,
+    const real boxFgRatio,
+    std::vector<std::vector<int>>* matchIndicesVecPtr,
+    std::vector<std::vector<int>>* negIndicesVecPtr) {
+  const size_t firstImage = matchIndicesVecPtr->size();
+  std::vector<int> allLabels;  // labels of all images, image after image
+  allLabels.reserve(batchSize * numPriorBBoxes);
+
+  std::vector<std::vector<real>> anchorBoxes;
+  anchorBoxes.reserve(numPriorBBoxes);
+  const real* priorData = priorValue.getData();
+  for (size_t n = 0; n < numPriorBBoxes; ++n) {
+    const real* prior = priorData + n * 7;
+    anchorBoxes.emplace_back(prior, prior + 5);  // coordinates + flag
+  }
+
+  size_t totalPos = 0;
+  size_t totalNeg = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    // init with -1 to label disabled anchors
+    std::vector<int> matchIndices(numPriorBBoxes, -1);
+    std::vector<int> labels(numPriorBBoxes, -1);
+    size_t numGTBBoxes = 0;
+    if (n < seqNum) numGTBBoxes = gtStartPosPtr[n + 1] - gtStartPosPtr[n];
+    if (numGTBBoxes) {
+      std::vector<std::vector<real>> gtBBoxes;
+      gtBBoxes.reserve(numGTBBoxes);
+      const real* gtData = gtValue.getData() + gtStartPosPtr[n] * 4;
+      for (size_t i = 0; i < numGTBBoxes; ++i) {
+        gtBBoxes.emplace_back(gtData + i * 4, gtData + i * 4 + 4);
+      }
+      // init with -1 so that rows bboxOverlaps() skips stay marked
+      std::vector<real> overlaps(numPriorBBoxes * numGTBBoxes, -1);
+      bboxOverlaps(anchorBoxes, gtBBoxes, overlaps);
+      const std::pair<size_t, size_t> numLabels =
+          labelAnchors(anchorBoxes,
+                       gtBBoxes,
+                       overlaps,
+                       posOverlapThreshold,
+                       negOverlapThreshold,
+                       matchIndices,
+                       labels);
+      totalPos += numLabels.first;
+      totalNeg += numLabels.second;
+    }
+    // An image without ground truth still contributes a fully disabled slice;
+    // otherwise the n * numPriorBBoxes indexing below would be shifted.
+    matchIndicesVecPtr->push_back(matchIndices);
+    allLabels.insert(allLabels.end(), labels.begin(), labels.end());
+  }
+
+  const size_t posQuota = boxBatchSize * boxFgRatio;
+  const size_t negQuota = boxBatchSize - posQuota;
+  if (totalPos > posQuota) {  // subsample positive labels if we have too many
+    sampleAnchors<int>(allLabels, 1, -1, posQuota, totalPos);
+  }
+  if (totalNeg > negQuota) {  // subsample negative labels if we have too many
+    sampleAnchors<int>(allLabels, 0, -1, negQuota, totalNeg);
+  }
+
+  for (size_t n = 0; n < batchSize; ++n) {
+    std::vector<int>& matchIndices = (*matchIndicesVecPtr)[firstImage + n];
+    std::vector<int> negIndices;
+    for (size_t i = 0; i < numPriorBBoxes; ++i) {
+      const int label = allLabels[n * numPriorBBoxes + i];
+      if (label != 1) matchIndices[i] = -1;  // only foreground keeps a match
+      if (label == 0) negIndices.push_back(i);
+    }
+    negIndicesVecPtr->push_back(negIndices);
+  }
+  // Report what was really kept, not the quotas: callers size buffers by it.
+  return std::make_pair(std::min(totalPos, posQuota),
+                        std::min(totalNeg, negQuota));
+}
+
+// Encodes gtBBox relative to anchorBox into target[0..3]: centre offsets
+// scaled by the anchor size, then log size ratios. Boxes: xmin,ymin,xmax,ymax.
+void RPNLossLayer::encodeTarget(const std::vector<real>& anchorBox,
+                                const std::vector<real>& gtBBox,
+                                std::vector<real>& target) {
+  const real anchorW = anchorBox[2] - anchorBox[0] + 1;
+  const real anchorH = anchorBox[3] - anchorBox[1] + 1;
+  const real gtW = gtBBox[2] - gtBBox[0] + 1;
+  const real gtH = gtBBox[3] - gtBBox[1] + 1;
+  const real anchorCx = (anchorBox[2] + anchorBox[0]) / 2;
+  const real anchorCy = (anchorBox[3] + anchorBox[1]) / 2;
+  const real gtCx = (gtBBox[2] + gtBBox[0]) / 2;
+  const real gtCy = (gtBBox[3] + gtBBox[1]) / 2;
+  target[0] = (gtCx - anchorCx) / anchorW;
+  target[1] = (gtCy - anchorCy) / anchorH;
+  target[2] = std::log(gtW / anchorW);
+  target[3] = std::log(gtH / anchorH);
+}
+
+// Forward pass: computes the RPN loss of the mini-batch,
+//   lossRatio_ * smoothL1 / numMatches_ + softmax cross-entropy / numConf_.
+// Inputs: [0] anchors, [1] ground-truth boxes (one sequence per image),
+// [2] location predictions, [3] confidence predictions.
+// Matching and both loss terms are evaluated on host memory; when useGpu_ is
+// set the inputs are first copied into CPU buffers. The gathered predictions
+// (locDiff_, confProb_) and targets are kept as members for backward().
+void RPNLossLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+  resetOutput(batchSize, 1);
+
+  // all location data and confidence score data
+  locSizeSum_ = 0;
+  confSizeSum_ = 0;
+  for (size_t n = 0; n < inputNum_; ++n) {  // there is only one for RPN
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    locSizeSum_ += inLoc->getElementCnt();
+    confSizeSum_ += inConf->getElementCnt();
+  }
+
+  // locBuffer layout:
+  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ......
+  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
+  locBuffer_ = locTmpBuffer_;
+
+  // confBuffer layout:
+  // | class1 score | class2 score | ... |classN score | class1 score | ......
+  Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_);
+  confBuffer_ = confTmpBuffer_;
+
+  // concatenate location data and confidence score data
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto& layerConf = config_.inputs(0).rpn_loss_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {  // there is only one for RPN
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    // Fall back to the configured shape when the input carries no frame size.
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+    locOffset += appendWithPermute(*inLoc,
+                                   height,
+                                   width,
+                                   locSizeSum_,
+                                   locOffset,
+                                   batchSize,
+                                   *locBuffer_,
+                                   kNCHWToNHWC);
+    confOffset += appendWithPermute(*inConf,
+                                    height,
+                                    width,
+                                    confSizeSum_,
+                                    confOffset,
+                                    batchSize,
+                                    *confBuffer_,
+                                    kNCHWToNHWC);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+
+  // priorValue layout:
+  // | xmin1 | ymin1 | xmax1 | ymax1 | overflow_flag | img_width | img_height |
+  // | xmin2 | ......
+  MatrixPtr priorValue;
+
+  // labelValue layout:
+  // | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | ......
+  MatrixPtr labelValue;
+
+  // Copy data from GPU to CPU if use GPU
+  if (useGpu_) {
+    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
+    Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false);
+    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
+    Matrix::resizeOrCreate(
+        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
+    MatrixPtr labelTmpValue = getInputValue(*getLabelLayer());
+    Matrix::resizeOrCreate(labelCpuValue_,
+                           labelTmpValue->getHeight(),
+                           labelTmpValue->getWidth(),
+                           false,
+                           false);
+
+    locCpuBuffer_->copyFrom(*locTmpBuffer_);
+    confCpuBuffer_->copyFrom(*confTmpBuffer_);
+    priorCpuValue_->copyFrom(*priorTmpValue);
+    labelCpuValue_->copyFrom(*labelTmpValue);
+
+    locBuffer_ = locCpuBuffer_;
+    confBuffer_ = confCpuBuffer_;
+    priorValue = priorCpuValue_;
+    labelValue = labelCpuValue_;
+  } else {
+    priorValue = getInputValue(*getPriorBoxLayer());
+    labelValue = getInputValue(*getLabelLayer());
+  }
+
+  // Match anchor-box to groundtruth bbox
+  const Argument& label = getInput(*getLabelLayer());
+  const int* labelIndex = label.sequenceStartPositions->getData(false);
+  size_t seqNum = label.getNumSequences();
+  allMatchIndices_.clear();
+  allNegIndices_.clear();
+  // every anchor occupies 7 values, see the priorValue layout above
+  numPriors_ = priorValue->getElementCnt() / 7;
+  generateMatchIndices(*priorValue,
+                       numPriors_,
+                       *labelValue,
+                       labelIndex,
+                       seqNum,
+                       batchSize,
+                       posOverlapThreshold_,
+                       negOverlapThreshold_,
+                       rpnBatchSize_,
+                       rpnFgRatio_,
+                       &allMatchIndices_,
+                       &allNegIndices_);
+  // Size everything below from the index lists themselves rather than from
+  // the returned pair: the gather loops visit exactly these anchors, so the
+  // buffers can neither be under-filled nor read past their end.
+  numMatches_ = 0;
+  numNegs_ = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    for (size_t i = 0; i < numPriors_; ++i) {
+      if (allMatchIndices_[n][i] != -1) ++numMatches_;
+    }
+    numNegs_ += allNegIndices_[n].size();
+  }
+
+  // BBox location L1 smooth loss
+  locLoss_ = 0.0;
+  if (numMatches_ >= 1) {
+    size_t count = 0;
+    MatrixPtr locLossOutput;
+    Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false);
+    Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false);
+    Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false);
+    locDiff_->zeroMem();
+    std::vector<real> locGTData;
+    locGTData.reserve(numMatches_ * 4);
+    real* locDiffData = locDiff_->getData();
+    const real* locBufferData = locBuffer_->getData();
+    const size_t locStride = locBuffer_->getElementCnt() / batchSize;
+    // Gather the 4 predicted offsets and the encoded target of every matched
+    // anchor in (image, anchor) order; backward() scatters the gradients in
+    // the same order.
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;  // match none
+        const real* locPred = locBufferData + n * locStride + i * 4;
+        std::copy(locPred, locPred + 4, locDiffData + count);
+        count += 4;
+        const int gtIdx = allMatchIndices_[n][i];
+        const real* prior = priorValue->getData() + i * 7;
+        const real* gt = labelValue->getData() + (labelIndex[n] + gtIdx) * 4;
+        std::vector<real> anchorBox(prior, prior + 4);
+        std::vector<real> gtBBox(gt, gt + 4);
+        std::vector<real> gtEncode(4);
+        encodeTarget(anchorBox, gtBBox, gtEncode);
+        locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end());
+      }
+    }
+    locGTData_->copyFrom(locGTData.data(), locGTData.size());
+    locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0);
+    locLoss_ = locLossOutput->getSum() / numMatches_ * lossRatio_;
+  }
+
+  // BBox confidence softmax loss
+  confLoss_ = 0;
+  numConf_ = numMatches_ + numNegs_;
+  if (numConf_ >= 1) {
+    Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false);
+    IVector::resizeOrCreate(confGTData_, numConf_, false);
+    confProb_->zeroMem();
+    size_t count = 0;
+
+    real* confProbData = confProb_->getData();
+    const real* confBufferData = confBuffer_->getData();
+    for (size_t n = 0; n < batchSize; ++n) {
+      // matched (foreground) anchors of image n first, labelled class 1 ...
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        confGTData_->getData()[count] = 1;
+        size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_;
+        std::copy(confBufferData + confOffset,
+                  confBufferData + confOffset + numClasses_,
+                  confProbData + count * numClasses_);
+        ++count;
+      }
+      // ... then the sampled background anchors of the same image
+      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
+        confGTData_->getData()[count] = backgroundId_;
+        size_t confOffset =
+            n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_;
+        std::copy(confBufferData + confOffset,
+                  confBufferData + confOffset + numClasses_,
+                  confProbData + count * numClasses_);
+        ++count;
+      }
+    }
+    CHECK_EQ(numConf_, count);
+    // softmax in place; backward() reuses confProb_ as the probabilities
+    confProb_->softmax(*confProb_);
+    MatrixPtr confLossOutput;
+    Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false);
+    confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_);
+    confLoss_ = confLossOutput->getSum() / numConf_;
+  }
+  real loss = locLoss_ + confLoss_;
+  MatrixPtr outV = getOutputValue();
+  outV->assign(loss);
+}
+// Backward pass: accumulates the loss gradient into the loc/conf input grads.
+void RPNLossLayer::backward(const UpdateCallback& callback) {
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+  locBuffer_->zeroMem();  // reused as gradient buffers; unsampled anchors
+  confBuffer_->zeroMem();  // therefore keep a zero gradient
+
+  // Back propagate on location prediction (locDiff_ vs. locGTData_)
+  if (numMatches_ >= 1) {
+    MatrixPtr locDiffBuffer;
+    Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false);
+    locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0);
+    locDiff_->copyFrom(*locDiffBuffer);
+    // scale gradient by lossRatio_ / numMatches_, matching forward()
+    for (size_t i = 0; i < numMatches_ * 4; ++i)
+      locDiff_->getData()[i] *= (1. / numMatches_ * lossRatio_);
+    // Copy gradient back, visiting matched anchors in forward()'s order
+    size_t count = 0;
+    const real* locDiffData = locDiff_->getData();
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        real* locBufferData =
+            locBuffer_->getData() + n * numPriors_ * 4 + i * 4;
+        std::copy(locDiffData + count * 4,
+                  locDiffData + (count + 1) * 4,
+                  locBufferData);
+        ++count;
+      }
+    }
+    CHECK_EQ(count, numMatches_);
+  }
+
+  if (numConf_ >= 1) {  // confidence: grad = (softmax - one-hot) / numConf_
+    for (size_t i = 0; i < numConf_; ++i)
+      confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1;
+    for (size_t i = 0; i < numConf_ * numClasses_; ++i)
+      confProb_->getData()[i] *= (1. / numConf_);
+    size_t count = 0;
+    const real* confProbData = confProb_->getData();
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        real* confDiffData = confBuffer_->getData() +
+                             n * numPriors_ * numClasses_ + i * numClasses_;
+        std::copy(confProbData + count * numClasses_,
+                  confProbData + (count + 1) * numClasses_,
+                  confDiffData);
+        ++count;
+      }
+      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
+        int idx = allNegIndices_[n][i];
+        real* confDiffData = confBuffer_->getData() +
+                             n * numPriors_ * numClasses_ + idx * numClasses_;
+        std::copy(confProbData + count * numClasses_,
+                  confProbData + (count + 1) * numClasses_,
+                  confDiffData);
+        ++count;
+      }
+    }
+    CHECK_EQ(count, numConf_);
+  }
+  if (useGpu_) {  // move the host-side gradients into the useGpu_ buffers
+    locTmpBuffer_->copyFrom(*locCpuBuffer_);
+    confTmpBuffer_->copyFrom(*confCpuBuffer_);
+    locBuffer_ = locTmpBuffer_;
+    confBuffer_ = confTmpBuffer_;
+  }
+  // copy back: permute each gradient to its input's layout and accumulate
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto layerConf = config_.inputs(0).rpn_loss_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n));
+    const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n));
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    // Only for unit tests: matrices constructed there carry no width and
+    // height information, so the shape is taken from the layer
+    // configuration instead.
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+
+    // NHWC to NCHW
+    MatrixPtr locGBuffer;
+    Matrix::resizeOrCreate(
+        locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_);
+    MatrixPtr confGBuffer;
+    Matrix::resizeOrCreate(
+        confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_);
+
+    locOffset += decomposeWithPermute(*locBuffer_,
+                                      height,
+                                      width,
+                                      locSizeSum_,
+                                      locOffset,
+                                      batchSize,
+                                      *locGBuffer,
+                                      kNHWCToNCHW);
+    inLocG->add(*locGBuffer);
+    confOffset += decomposeWithPermute(*confBuffer_,
+                                       height,
+                                       width,
+                                       confSizeSum_,
+                                       confOffset,
+                                       batchSize,
+                                       *confGBuffer,
+                                       kNHWCToNCHW);
+    inConfG->add(*confGBuffer);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RPNLossLayer.h b/paddle/gserver/layers/RPNLossLayer.h
new file mode 100644
index 0000000000000..60755f3685ed3
--- /dev/null
+++ b/paddle/gserver/layers/RPNLossLayer.h
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "CostLayer.h"
+#include "DataLayer.h"
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+using std::vector;
+using std::pair;
+
+namespace paddle {
+
+/**
+ * The loss layer for region proposal in Faster R-CNN.
+ * The loss is composed of the location loss and the confidence loss.
+ * The location loss is a smooth L1 loss and the confidence loss is
+ * a softmax loss.
+ * - Input: This layer needs four input layers: The first input layer
+ *          is the anchor-box layer and the second layer is a label layer.
+ *          The remaining two input layers are convolution layers generating
+ *          the bbox location offsets and the classification confidence.
+ * - Output: The Region Proposal Network loss value.
+ * Reference:
+ *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster R-CNN:
+ *    Towards Real-Time Object Detection with Region Proposal Networks.
+ */
+
+class RPNLossLayer : public CostLayer {
+public:
+  explicit RPNLossLayer(const LayerConfig& config)
+      : CostLayer(config), rand_(0, 1) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  // Empty CostLayer hooks: the loss is computed in forward()/backward().
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override {}
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override {}
+
+protected:
+  // Input order: anchors, labels, inputNum_ location inputs, then inputNum_
+  // confidence inputs.
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+  inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[2 + index];
+  }
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[2 + inputNum_ + index];
+  }
+
+  // IoU of each in-image anchor with each gtBBox, row-major into overlaps.
+  void bboxOverlaps(const std::vector<std::vector<real>>& anchorBoxes,
+                    const std::vector<std::vector<real>>& gtBBoxes,
+                    std::vector<real>& overlaps);
+  // Labels one image's anchors (1 fg, 0 bg, -1 ignored); returns (fg, bg).
+  std::pair<size_t, size_t> labelAnchors(
+      const std::vector<std::vector<real>>& anchorBoxes,
+      const std::vector<std::vector<real>>& gtBBoxes,
+      const std::vector<real>& overlaps,
+      const real posOverlapThreshold,
+      const real negOverlapThreshold,
+      std::vector<int>& matchIndices,
+      std::vector<int>& labels);
+  // Randomly keeps m entries equal to label; the rest become disabledLable.
+  template <typename T>
+  void sampleAnchors(
+      std::vector<T>& allLabels, T label, T disabledLable, size_t m, size_t n);
+  // Matches anchors to gtBBoxes and samples fg/bg anchors for the mini-batch.
+  pair<size_t, size_t> generateMatchIndices(
+      const Matrix& priorValue,
+      const size_t numPriorBBoxes,
+      const Matrix& gtValue,
+      const int* gtStartPosPtr,
+      const size_t seqNum,
+      const size_t batchSize,
+      const real posOverlapThreshold,
+      const real negOverlapThreshold,
+      const size_t boxBatchSize,
+      const real boxFgRatio,
+      std::vector<std::vector<int>>* matchIndicesVecPtr,
+      std::vector<std::vector<int>>* negIndicesVecPtr);
+  // Encodes gtBBox relative to anchorBox into the 4 regression targets.
+  void encodeTarget(const std::vector<real>& anchorBox,
+                    const std::vector<real>& gtBBox,
+                    std::vector<real>& target);
+
+  // Configuration, filled in by init(); rand_ draws from [0, 1) for sampling.
+  real posOverlapThreshold_;
+  real negOverlapThreshold_;
+  size_t rpnBatchSize_;
+  real rpnFgRatio_;
+  real lossRatio_;
+  size_t numClasses_;
+  size_t inputNum_;
+  size_t backgroundId_;
+  std::uniform_real_distribution<real> rand_;
+  // State written by forward(); the counts and index lists drive backward().
+  real locLoss_;
+  real confLoss_;
+  size_t numPriors_;
+  size_t numMatches_;
+  size_t numNegs_;
+  size_t numConf_;
+  size_t locSizeSum_;
+  size_t confSizeSum_;
+  vector<vector<int>> allMatchIndices_;
+  vector<vector<int>> allNegIndices_;
+  MatrixPtr locGTData_;
+  IVectorPtr confGTData_;
+  // Working buffers (locDiff_ and confProb_ are also read by backward()).
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locDiff_;
+  MatrixPtr confProb_;
+  MatrixPtr labelCpuValue_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 92f6cbcfe5a0e..f802b3efed123 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -52,6 +52,13 @@ add_unittest_without_exec(test_DetectionOutput
 
 add_test(NAME test_DetectionOutput 
     COMMAND test_DetectionOutput)
+################# test_Proposal #######################
+add_unittest_without_exec(test_Proposal
+    test_Proposal.cpp
+    LayerGradUtil.cpp)
+
+add_test(NAME test_Proposal 
+    COMMAND test_Proposal)
 ################# test_ConvUnify #######################
 add_unittest_without_exec(test_ConvUnify
     test_ConvUnify.cpp
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 67251f08e34fa..2c02ca666c4e3 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1766,6 +1766,75 @@ TEST(Layer, multibox_loss) {
   }
 }
 
+// Gradient check of the rpn_loss layer: 9 anchors, one ground-truth box, and
+// a first anchor pinned so that at least one anchor is matched.
+TEST(Layer, rpn_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("rpn_loss");
+  config.biasSize = 0;
+  RPNLossConfig* lossConf =
+      config.layerConfig.add_inputs()->mutable_rpn_loss_conf();
+  lossConf->set_pos_overlap_threshold(0.7);
+  lossConf->set_neg_overlap_threshold(0.3);
+  lossConf->set_rpn_batch_size(2);
+  lossConf->set_rpn_fg_ratio(0.5);
+  lossConf->set_loss_ratio(10);
+  lossConf->set_height(3);
+  lossConf->set_width(3);
+
+  // Labels: gtNum boxes of (1, 1, 6, 6), each forming its own sequence.
+  const size_t gtNum = 1;
+  MatrixPtr labelValue = Matrix::create(gtNum, 4, false, false);
+  for (size_t i = 0; i < gtNum; ++i) {
+    real* box = labelValue->getData() + i * labelValue->getWidth();
+    box[0] = 1;
+    box[1] = 1;
+    box[2] = 6;
+    box[3] = 6;
+  }
+  vector<int> seqStartPositions(gtNum + 1);
+  for (size_t i = 0; i <= gtNum; ++i) {
+    seqStartPositions[i] = i;
+  }
+
+  // Anchors: 7 values each (xmin, ymin, xmax, ymax, flag, 16, 16). The flag is
+  // 1 when xmax and ymax are both below 16 and -1 otherwise.
+  MatrixPtr priorValue = Matrix::create(1, 63, false, false);
+  const size_t numAnchors = priorValue->getElementCnt() / 7;
+  for (size_t i = 0; i < numAnchors; ++i) {
+    real* anchor = priorValue->getData() + i * 7;
+    anchor[0] = std::rand() % 16;
+    anchor[1] = std::rand() % 16;
+    anchor[2] = std::rand() % 16 + std::rand() % 16;
+    anchor[3] = std::rand() % 16 + std::rand() % 16;
+    anchor[4] = (anchor[2] < 16 && anchor[3] < 16) ? 1 : -1;
+    anchor[5] = 16;
+    anchor[6] = 16;
+  }
+  // Ensure at least one matched bbox: the first anchor (1, 2, 6, 7) overlaps
+  // the ground truth with IoU 30/42, above the 0.7 positive threshold.
+  const real pinned[7] = {1, 2, 6, 7, 1, 16, 16};
+  for (size_t k = 0; k < 7; ++k) {
+    priorValue->getData()[k] = pinned[k];
+  }
+
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "anchorbox", priorValue, {}});
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
+  config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
+  config.inputDefs.push_back({INPUT_DATA, "confPred", 18, 0});
+  // One LayerInputConfig per remaining input (the first was added above).
+  for (int k = 0; k < 3; ++k) {
+    config.layerConfig.add_inputs();
+  }
+
+  // Run the gradient check with useGpu off and on.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(config, "rpn_loss", 1, false, useGpu, false);
+  }
+}
+
 TEST(Layer, TransLayer) {
   TestConfig config;
   const int height = 128;
diff --git a/paddle/gserver/tests/test_Proposal.cpp b/paddle/gserver/tests/test_Proposal.cpp
new file mode 100644
index 0000000000000..28593363544d3
--- /dev/null
+++ b/paddle/gserver/tests/test_Proposal.cpp
@@ -0,0 +1,163 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+// Runs a single forward pass of the proposal layer on the given inputs and
+// checks that its output matches the expected result matrix.
+void doOneProposalTest(MatrixPtr& inputLoc,
+                       MatrixPtr& inputConf,
+                       MatrixPtr& inputAnchor,
+                       size_t feature_map_width,
+                       size_t feature_map_height,
+                       real nms_threshold,
+                       bool use_gpu,
+                       MatrixPtr& result) {
+  // Describe the proposal layer; its options are stored on the first input.
+  TestConfig cfg;
+  cfg.layerConfig.set_type("proposal");
+  LayerInputConfig* firstInput = cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+
+  ProposalConfig* options = firstInput->mutable_proposal_conf();
+  options->set_width(feature_map_width);
+  options->set_height(feature_map_height);
+  options->set_nms_threshold(nms_threshold);
+  options->set_confidence_threshold(0.01);
+  options->set_nms_top_k(20);
+  options->set_keep_top_k(10);
+  options->set_min_width(0);
+  options->set_min_height(0);
+  cfg.inputDefs.push_back({INPUT_DATA_TARGET, "anchors", 28, 0});
+  cfg.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0});
+  cfg.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0});
+
+  // Create one data layer per input definition, then overwrite their values
+  // with the caller's fixtures (same order: anchors, locations, confidences).
+  std::vector<DataLayerPtr> feeders;
+  LayerMap layers;
+  vector<Argument> feeds;
+  initDataLayer(cfg, &feeders, &feeds, &layers, "anchors", 1, false, use_gpu);
+
+  feeders[0]->getOutputValue()->copyFrom(*inputAnchor);
+  feeders[1]->getOutputValue()->copyFrom(*inputLoc);
+  feeders[2]->getOutputValue()->copyFrom(*inputConf);
+
+  // Build the layer under test with FLAGS_use_gpu temporarily overridden.
+  const bool savedUseGpu = FLAGS_use_gpu;
+  FLAGS_use_gpu = use_gpu;
+  std::vector<ParameterPtr> params;
+  LayerPtr layerUnderTest;
+  initTestLayer(cfg, &layers, &params, &layerUnderTest);
+  FLAGS_use_gpu = savedUseGpu;
+  layerUnderTest->forward(PASS_GC);
+  checkMatrixEqual(layerUnderTest->getOutputValue(), result);
+}
+
+TEST(Layer, detectionOutputLayerFwd) {  // NOTE: exercises the proposal layer
+  bool useGpu = false;
+  // CPU case 1: the expected output is two rows of 7 values each.
+  MatrixPtr inputLoc;
+  MatrixPtr inputConf;
+  MatrixPtr inputAnchor;
+  MatrixPtr result;
+  real nmsThreshold = 0.01;
+  real inputLocData[] = {0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1};
+  real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6};
+  real inputAnchorData[] = {1, 1, 6, 6, 1, 16, 16, 1, 3, 6, 8, 1, 16, 16,
+                            1, 2, 6, 7, 1, 16, 16, 2, 1, 7, 6, 1, 16, 16};
+  real resultData[] = {0,
+                       1,
+                       0.68997448,
+                       3.18894059,
+                       3.18894059,
+                       5.01105940,
+                       5.01105940,
+                       0,
+                       1,
+                       0.64565631,
+                       3.18894059,
+                       5.18894059,
+                       5.01105940,
+                       7.01105940};
+  inputLoc = Matrix::create(1, 16, false, useGpu);
+  inputConf = Matrix::create(1, 8, false, useGpu);
+  inputAnchor = Matrix::create(1, 28, false, useGpu);
+  result = Matrix::create(2, 7, false, useGpu);
+  inputLoc->setData(inputLocData);
+  inputConf->setData(inputConfData);
+  inputAnchor->setData(inputAnchorData);
+  result->setData(resultData);
+  doOneProposalTest(inputLoc,
+                    inputConf,
+                    inputAnchor,
+                    /* feature_map_width */ 1,
+                    /* feature_map_height */ 1,
+                    nmsThreshold,
+                    useGpu,
+                    result);
+
+#ifndef PADDLE_ONLY_CPU
+  // GPU case 1: same fixtures; element counts must match the host arrays.
+  useGpu = true;
+  inputLoc = Matrix::create(1, 16, false, useGpu);
+  inputConf = Matrix::create(1, 8, false, useGpu);
+  inputAnchor = Matrix::create(1, 28, false, useGpu);
+  inputLoc->copyFrom(inputLocData, 16);
+  inputConf->copyFrom(inputConfData, 8);
+  inputAnchor->copyFrom(inputAnchorData, 28);  // array holds 28 values, not 32
+
+  nmsThreshold = 0.01;
+  result = Matrix::create(2, 7, false, useGpu);
+  result->copyFrom(resultData, 14);  // copy both expected rows (2 x 7)
+  doOneProposalTest(inputLoc,
+                    inputConf,
+                    inputAnchor,
+                    /* feature_map_width */ 1,
+                    /* feature_map_height */ 1,
+                    nmsThreshold,
+                    useGpu,
+                    result);
+#endif
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);  // gtest gets the arguments first
+  initMain(argc, argv);                  // then the project's initMain
+  return RUN_ALL_TESTS();                // gtest's result is the exit code
+}
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 37cd16c798907..9e934e022a734 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -289,6 +289,35 @@ message DetectionOutputConfig {
   optional uint32 width = 9 [default = 1];
 }
 
+message AnchorConfig {  // options of the 'anchor' layer
+  required uint32 base_size = 1;  // basic anchor size
+  repeated float aspect_ratio = 2;  // aspect ratios used to generate anchors
+  repeated float scale_ratio = 3;  // scales used to generate anchors
+  optional uint32 feat_stride_x = 4 [default = 0];  // image/feature-map scale
+  optional uint32 feat_stride_y = 5 [default = 0];  // image/feature-map scale
+}
+
+message RPNLossConfig {  // options of the 'rpn_loss' layer
+  required float pos_overlap_threshold = 1;  // overlap threshold, foreground
+  required float neg_overlap_threshold = 2;  // overlap threshold, background
+  required uint32 rpn_batch_size = 3;  // bbox batch size for RPN training
+  required float rpn_fg_ratio = 4;  // ratio of positive bboxes in the batch
+  required float loss_ratio = 5;  // ratio of location loss to confidence loss
+  optional uint32 height = 6 [default = 1];  // assumed feature map height
+  optional uint32 width = 7 [default = 1];  // assumed feature map width
+}
+
+message ProposalConfig {  // options of the 'proposal' layer
+  required float nms_threshold = 1;  // non-maximum suppression threshold
+  required float confidence_threshold = 2;  // class confidence threshold
+  required uint32 nms_top_k = 3;  // number of bboxes kept by NMS
+  required uint32 keep_top_k = 4;  // number of bboxes kept in the output
+  required float min_width = 5;  // proposal width threshold
+  required float min_height = 6;  // proposal height threshold
+  optional uint32 height = 7 [default = 1];  // feature map height
+  optional uint32 width = 8 [default = 1];  // feature map width
+}
+
 message LayerInputConfig {
   required string input_layer_name = 1;
   optional string input_parameter_name = 2;
@@ -309,6 +338,9 @@ message LayerInputConfig {
   optional RowConvConfig row_conv_conf = 15;
   optional MultiBoxLossConfig multibox_loss_conf = 16;
   optional DetectionOutputConfig detection_output_conf = 17;
+  optional AnchorConfig anchor_conf = 18;
+  optional RPNLossConfig rpn_loss_conf = 19;
+  optional ProposalConfig proposal_conf = 20;
 }
 
 message LayerConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 826ba2834a820..2476aefcee0d7 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1724,6 +1724,63 @@ def __init__(self, name, inputs, size, input_num, num_classes,
         self.config.size = size
 
 
+@config_layer('anchor')
+class AnchorLayer(LayerBase):
+    """Config of the 'anchor' layer, which takes exactly two inputs."""
+
+    def __init__(self, name, inputs, base_size, aspect_ratio, scale_ratio,
+                 feat_stride_x=None, feat_stride_y=None):
+        super(AnchorLayer, self).__init__(name, 'anchor', 0, inputs)
+        config_assert(len(inputs) == 2, 'AnchorLayer must have 2 inputs')
+        anchor_conf = self.config.inputs[0].anchor_conf
+        anchor_conf.base_size = base_size
+        anchor_conf.aspect_ratio.extend(aspect_ratio)
+        anchor_conf.scale_ratio.extend(scale_ratio)
+        # Protobuf scalars reject None: leave unset strides at the default.
+        if feat_stride_x is not None:
+            anchor_conf.feat_stride_x = feat_stride_x
+        if feat_stride_y is not None:
+            anchor_conf.feat_stride_y = feat_stride_y
+
+
+@config_layer('rpn_loss')
+class RPNLossLayer(LayerBase):
+    """Config of the 'rpn_loss' layer, which takes exactly four inputs."""
+    def __init__(self, name, inputs, pos_overlap_threshold,
+                 neg_overlap_threshold, rpn_batch_size, rpn_fg_ratio,
+                 loss_ratio):
+        super(RPNLossLayer, self).__init__(name, 'rpn_loss', 0, inputs)
+        config_assert(len(inputs) == 4, 'RPNLossLayer must have 4 inputs')
+        rpn_loss_conf = self.config.inputs[0].rpn_loss_conf
+        rpn_loss_conf.pos_overlap_threshold = pos_overlap_threshold
+        rpn_loss_conf.neg_overlap_threshold = neg_overlap_threshold
+        rpn_loss_conf.rpn_batch_size = rpn_batch_size
+        rpn_loss_conf.rpn_fg_ratio = rpn_fg_ratio
+        rpn_loss_conf.loss_ratio = loss_ratio
+
+
+@config_layer('proposal')
+class ProposalLayer(LayerBase):
+    """Config of the 'proposal' layer.
+
+    All ProposalConfig fields are written to the first of the three inputs.
+    """
+
+    def __init__(self, name, inputs, nms_threshold, confidence_threshold,
+                 nms_top_k, keep_top_k, min_width=0, min_height=0):
+        super(ProposalLayer, self).__init__(name, 'proposal', 0, inputs)
+        config_assert(len(inputs) == 3, 'ProposalLayer must have 3 inputs')
+        proposal_conf = self.config.inputs[0].proposal_conf
+        # Thresholds and top-k limits.
+        proposal_conf.nms_threshold = nms_threshold
+        proposal_conf.confidence_threshold = confidence_threshold
+        proposal_conf.nms_top_k = nms_top_k
+        proposal_conf.keep_top_k = keep_top_k
+        # Proposal size thresholds.
+        proposal_conf.min_width = min_width
+        proposal_conf.min_height = min_height
+
+
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self, name, size, height=None, width=None, device=None):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b0524a507bace..3fe86900ccaca 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -117,6 +117,9 @@
     'cross_channel_norm_layer',
     'multibox_loss_layer',
     'detection_output_layer',
+    'anchor_layer',
+    'rpn_loss_layer',
+    'proposal_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -199,6 +202,9 @@ class LayerType(object):
     PRIORBOX_LAYER = 'priorbox'
     MULTIBOX_LOSS_LAYER = 'multibox_loss'
     DETECTION_OUTPUT_LAYER = 'detection_output'
+    ANCHOR_LAYER = 'anchor'
+    RPN_LOSS_LAYER = 'rpn_loss'
+    PROPOSAL_LAYER = 'proposal'
 
     CTC_LAYER = 'ctc'
     WARP_CTC_LAYER = 'warp_ctc'
@@ -1197,6 +1203,153 @@ def detection_output_layer(input_loc,
         name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
 
 
+@wrap_name_default("anchor")
+def anchor_layer(input,
+                 image,
+                 base_size,
+                 aspect_ratio,
+                 scale_ratio,
+                 feat_stride_x=None,
+                 feat_stride_y=None,
+                 name=None):
+    """
+    Generate the default anchor boxes needed by the RPN of Faster R-CNN.
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param image: The network input image.
+    :type image: LayerOutput
+    :param base_size: The basic anchor size.
+    :type base_size: int
+    :param aspect_ratio: The aspect ratio used to generate anchors.
+    :type aspect_ratio: list
+    :param scale_ratio: The scales used to generate anchors.
+    :type scale_ratio: list
+    :param feat_stride_x: The spatial scale between the image and feature map.
+    :type feat_stride_x: int
+    :param feat_stride_y: The spatial scale between the image and feature map.
+    :type feat_stride_y: int
+    :return: LayerOutput
+    """
+    parents = [input, image]
+    Layer(
+        name=name,
+        type=LayerType.ANCHOR_LAYER,
+        inputs=[parent.name for parent in parents],
+        base_size=base_size,
+        aspect_ratio=aspect_ratio,
+        scale_ratio=scale_ratio,
+        feat_stride_x=feat_stride_x,
+        feat_stride_y=feat_stride_y)
+    return LayerOutput(name, LayerType.ANCHOR_LAYER, parents=parents)
+
+
+@wrap_name_default("rpn_loss")
+def rpn_loss_layer(input_loc,
+                   input_conf,
+                   anchors,
+                   label,
+                   pos_overlap_threshold=0.7,
+                   neg_overlap_threshold=0.3,
+                   rpn_batch_size=256,
+                   rpn_fg_ratio=0.5,
+                   loss_ratio=10,
+                   name=None):
+    """
+    Compute the location loss and the confidence loss for the
+    Region Proposal Networks of Faster R-CNN.
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input_loc: The input predict locations.
+    :type input_loc: LayerOutput
+    :param input_conf: The input anchor box confidence.
+    :type input_conf: LayerOutput
+    :param anchors: The input anchor boxes location.
+    :type anchors: LayerOutput
+    :param label: The input label.
+    :type label: LayerOutput
+    :param pos_overlap_threshold: The threshold of the overlap for foreground.
+    :type pos_overlap_threshold: float
+    :param neg_overlap_threshold: The threshold of the overlap for background.
+    :type neg_overlap_threshold: float
+    :param rpn_batch_size: The size of bbox batch for RPN training.
+    :type rpn_batch_size: int
+    :param rpn_fg_ratio: The ratio of the positive bbox in bbox batch.
+    :type rpn_fg_ratio: float
+    :param loss_ratio: The ratio of location loss to confidence loss.
+    :type loss_ratio: float
+    :return: LayerOutput
+    """
+    # Keep the LayerOutput objects (not just their names) as parents, in the
+    # same order as the layer inputs: anchors, label, locations, confidences.
+    parents = [anchors, label, input_loc, input_conf]
+    Layer(
+        name=name,
+        type=LayerType.RPN_LOSS_LAYER,
+        inputs=[parent.name for parent in parents],
+        pos_overlap_threshold=pos_overlap_threshold,
+        neg_overlap_threshold=neg_overlap_threshold,
+        rpn_batch_size=rpn_batch_size,
+        rpn_fg_ratio=rpn_fg_ratio,
+        loss_ratio=loss_ratio)
+    return LayerOutput(name, LayerType.RPN_LOSS_LAYER, parents=parents)
+
+
+@wrap_name_default("proposal")
+def proposal_layer(input_loc,
+                   input_conf,
+                   anchors,
+                   nms_threshold=0.45,
+                   confidence_threshold=0.01,
+                   nms_top_k=400,
+                   keep_top_k=200,
+                   min_width=16,
+                   min_height=16,
+                   name=None):
+    """
+    Apply the NMS to the output of RPN and compute the proposal location.
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input_loc: The input predict locations.
+    :type input_loc: LayerOutput
+    :param input_conf: The input anchor box confidence.
+    :type input_conf: LayerOutput
+    :param anchors: The input anchor boxes location.
+    :type anchors: LayerOutput
+    :param nms_threshold: The Non-maximum suppression threshold.
+    :type nms_threshold: float
+    :param confidence_threshold: The classification confidence threshold.
+    :type confidence_threshold: float
+    :param nms_top_k: The bbox number kept of the NMS's output.
+    :type nms_top_k: int
+    :param keep_top_k: The bbox number kept of the layer's output.
+    :type keep_top_k: int
+    :param min_width: The proposal width threshold.
+    :type min_width: float
+    :param min_height: The proposal height threshold.
+    :type min_height: float
+    :return: LayerOutput
+    """
+    # The same three layers, in the same order, serve as both the config
+    # inputs (by name) and the parents of the returned LayerOutput.
+    parents = [anchors, input_loc, input_conf]
+    Layer(
+        name=name,
+        type=LayerType.PROPOSAL_LAYER,
+        inputs=[parent.name for parent in parents],
+        nms_threshold=nms_threshold,
+        confidence_threshold=confidence_threshold,
+        nms_top_k=nms_top_k,
+        keep_top_k=keep_top_k,
+        min_width=min_width,
+        min_height=min_height)
+    return LayerOutput(name, LayerType.PROPOSAL_LAYER, parents=parents)
+
+
 @wrap_name_default("cross_channel_norm")
 def cross_channel_norm_layer(input, name=None, param_attr=None):
     """