From 4d8992c3bc64a835aa6a1e6e12678594d3f117b5 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 18 Aug 2017 09:58:41 +0800 Subject: [PATCH 01/12] check format before set header format --- paddle/parameter/Parameter.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h index e31cbc3dee6c5..08a426eb742b3 100644 --- a/paddle/parameter/Parameter.h +++ b/paddle/parameter/Parameter.h @@ -278,7 +278,11 @@ class Parameter { /** * @brief Set the format in header. */ - void setHeaderFormat(int32_t fmt) { headerFormat_ = fmt; } + void setHeaderFormat(int32_t fmt) { + CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: " + << fmt; + headerFormat_ = fmt; + } /** * @brief Parameter Update Hook. From 462b9b1d20942dca35dbe532248e53cdeccea6b2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 18 Aug 2017 10:13:06 +0800 Subject: [PATCH 02/12] update mkldnn tag v0.10 --- cmake/external/mkldnn.cmake | 2 +- cmake/external/mklml.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 25c6b4ef52d3f..9686df0021900 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -51,7 +51,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "v0.9" + GIT_TAG "v0.10" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index e9fd3d4bedc98..51fafb94791dd 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -28,7 +28,7 @@ INCLUDE(ExternalProject) SET(MKLML_PROJECT "extern_mklml") SET(MKLML_VER "mklml_lnx_2018.0.20170720") -SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") +SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz") SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") SET(MKLML_DST_DIR "mklml") From 62e6dac402ca63b402b5dfd1d7649cba1e258d41 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 18 Aug 2017 14:30:09 +0800 Subject: [PATCH 03/12] add MKLDNNMatrix files --- paddle/gserver/layers/MKLDNNLayer.h | 1 + paddle/math/CMakeLists.txt | 15 ++++++++++ paddle/math/MKLDNNMatrix.cpp | 19 ++++++++++++ paddle/math/MKLDNNMatrix.h | 45 +++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+) create mode 100644 paddle/math/MKLDNNMatrix.cpp create mode 100644 paddle/math/MKLDNNMatrix.h diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 63e29f447eede..9533027fa6c75 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "Layer.h" #include "MKLDNNBase.h" #include "mkldnn.hpp" +#include "paddle/math/MKLDNNMatrix.h" DECLARE_bool(use_mkldnn); DECLARE_bool(use_mkldnn_wgt); diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt index bf28092e82b77..ad6de18c81d60 100644 --- a/paddle/math/CMakeLists.txt +++ b/paddle/math/CMakeLists.txt @@ -14,6 +14,21 @@ # file(GLOB MATH_HEADERS . *.h) file(GLOB MATH_SOURCES . 
*.cpp) + +message(STATUS "----------MATH_HEADERS:${MATH_HEADERS}") +message(STATUS "----------MATH_SOURCES:${MATH_SOURCES}") +if(NOT WITH_MKLDNN) + file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h") + file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp") + message(STATUS "----------DNN_HEADER:${DNN_HEADER}") + message(STATUS "----------DNN_SOURCES:${DNN_SOURCES}") + list(REMOVE_ITEM MATH_HEADERS ${DNN_HEADER}) + list(REMOVE_ITEM MATH_SOURCES ${DNN_SOURCES}) + message(STATUS "Skip compiling with MKLDNNMatrix") +else() + message(STATUS "Compile with MKLDNNMatrix") +endif() + set(MATH_SOURCES "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu" "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu" diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp new file mode 100644 index 0000000000000..df8e72d78bedd --- /dev/null +++ b/paddle/math/MKLDNNMatrix.cpp @@ -0,0 +1,19 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNMatrix.h" + +using namespace mkldnn; // NOLINT + +namespace paddle {} // namespace paddle diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h new file mode 100644 index 0000000000000..91ef56f2c3476 --- /dev/null +++ b/paddle/math/MKLDNNMatrix.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +//#include "Matrix.h" +#include "Vector.h" + +#include "mkldnn.hpp" +#include "paddle/parameter/Parameter.h" + +namespace paddle { + +static const std::map PARAM_FOARMAT_MAP = + {{mkldnn::memory::format::oi, PARAM_FORMAT_MKLDNN_OI}}; + +class MKLDNNMatrix; +typedef std::shared_ptr MKLDNNMatrixPtr; + +/** + * @brief MKLDNN Matrix. 
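+ * A CPU-side buffer paired with the MKL-DNN memory format its data is laid out in.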
+ * + */ +class MKLDNNMatrix : public CpuVector { +public: + explicit MKLDNNMatrix(size_t size, int fmt) : CpuVector(size), fmt_(fmt) {} + + ~MKLDNNMatrix() {} + +protected: + int fmt_; +}; + +} // namespace paddle From 4bffbd30f0dbc2a2bbff4aa8108867fceecc260a Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 21 Aug 2017 16:44:30 +0800 Subject: [PATCH 04/12] use MKLDNNMatrix in fc forward --- paddle/gserver/layers/Layer.cpp | 2 +- paddle/gserver/layers/Layer.h | 20 +++++++- paddle/gserver/layers/MKLDNNFcLayer.cpp | 63 ++++++++++++++++--------- paddle/gserver/layers/MKLDNNLayer.h | 25 +++++++--- paddle/math/CMakeLists.txt | 4 -- paddle/math/MKLDNNMatrix.cpp | 29 +++++++++++- paddle/math/MKLDNNMatrix.h | 43 +++++++++++++---- 7 files changed, 143 insertions(+), 43 deletions(-) diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index d5621412caee8..2bc20eee6c452 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -41,7 +41,7 @@ namespace paddle { Layer::Layer(const LayerConfig& config, bool useGpu) : config_(config), useGpu_(useGpu), - deviceId_(-1), + deviceId_(CPU_DEVICE), needSequenceInfo_(true) {} bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h index 0ed482889d0ce..ec4d093e0cac9 100644 --- a/paddle/gserver/layers/Layer.h +++ b/paddle/gserver/layers/Layer.h @@ -59,7 +59,12 @@ class Layer { LayerConfig config_; /// whether to use GPU bool useGpu_; - /// Device Id. CPU is -1, and GPU is 0, 1, 2 ... + /// Paddle device ID, MKLDNN is -2, CPU is -1 + enum PADDLE_DEVICE_ID { + MKLDNN_DEVICE = -2, + CPU_DEVICE = -1, + }; + /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ... int deviceId_; /// Input layers std::vector inputLayers_; @@ -321,6 +326,19 @@ class Layer { if (deviceId == getDeviceId()) { return output_; } else { + bool CPU2MKLDNN = + getDeviceId() == CPU_DEVICE && deviceId == MKLDNN_DEVICE; + bool MKLDNN2CPU = + getDeviceId() == MKLDNN_DEVICE && deviceId == CPU_DEVICE; + if (CPU2MKLDNN) { + // TODO: do something + return output_; + } else if (MKLDNN2CPU) { + // TODO: do something + return output_; + } + + // TODO: handle mkldnn device or add mkldnn device to other for (size_t i = 0; i < outputOtherDevice_.size(); i++) { if (outputOtherDevice_[i].deviceId == deviceId) { return outputOtherDevice_[i]; diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index d201fac65e045..fac0390eee501 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -135,33 +135,51 @@ void MKLDNNFcLayer::reshape() { void MKLDNNFcLayer::resetFwd() { bool hasBias = biases_ && biases_->getW(); - real* iData = getInputValue(0)->getData(); - real* oData = getOutputValue()->getData(); - real* wData = weight_->getW()->getData(); - real* bData = hasBias ? biases_->getW()->getData() : NULL; + const MatrixPtr& in = getInputValue(0); + const MatrixPtr& wgt = weight_->getW(); + const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr; + const MatrixPtr& out = output_.value; + + if (getPrev(0)->getDeviceId() == MKLDNN_DEVICE) { + inVal_ = std::dynamic_pointer_cast(in); + CHECK(inVal_) << "Input should be MKLDNNMatrix"; + // TODO: change input nchw to nc if available + // inVal_->downSpatial() + } else { + inVal_ = MKLDNNMatrix::create( + in, + hasSpatial_ ? memory::dims{bs_, ic_, ih_, iw_} : memory::dims{bs_, ic_}, + hasSpatial_ ? 
format::nchw : format::nc, + engine_); + } - // TODO(TJ): below create should be covered in MkldnnMatrix - // create memory desc - memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) - : createMD({bs_, ic_}, format::nc); - memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw) - : createMD({oc_, ic_}, format::oi); - memory::desc bMD = bData != NULL ? createMD({oc_}, format::x) - : createMD({}, format::format_undef); - memory::desc oMD = createMD({bs_, oc_}, format::nc); + wgtVal_ = MKLDNNMatrix::create( + wgt, + hasSpatial_ ? memory::dims{oc_, ic_, ih_, iw_} : memory::dims{oc_, ic_}, + hasSpatial_ ? format::oihw : format::oi, + engine_); - // create memory primitive desc and memory self - inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); - wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData)); - outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData)); + biasVal_ = + hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr; + + outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_); + + // change original output to mkldnn output + output_.value = std::dynamic_pointer_cast(outVal_); + // create forward handle prop_kind pk = prop_kind::forward; - fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD) - : fc_fwd::desc(pk, iMD, wMD, oMD); + fc_fwd::desc fwdDesc = + hasBias ? fc_fwd::desc(pk, + inVal_->getMD(), + wgtVal_->getMD(), + biasVal_->getMD(), + outVal_->getMD()) + : fc_fwd::desc( + pk, inVal_->getMD(), wgtVal_->getMD(), outVal_->getMD()); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); - if (bData != NULL) { - biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData)); + if (hasBias) { fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_)); } else { fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_)); @@ -197,7 +215,8 @@ void MKLDNNFcLayer::resetBwd() { // update data inVal_->set_data_handle(iData); } else { - inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); + LOG(FATAL) << "Should not be empty"; + // inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); } // create memory primitive desc and memory self diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 9533027fa6c75..b44095befb66a 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -21,7 +21,6 @@ limitations under the License. 
*/ #include "paddle/math/MKLDNNMatrix.h" DECLARE_bool(use_mkldnn); -DECLARE_bool(use_mkldnn_wgt); namespace paddle { @@ -54,13 +53,14 @@ class MKLDNNLayer : public Layer { std::vector pipelineBwd_; // TODO(TJ): change below memory as MKLDNNMatrixPtr type - std::shared_ptr inVal_; + // MKLDNNMatrixPtr ; + MKLDNNMatrixPtr inVal_; std::shared_ptr inGrad_; - std::shared_ptr outVal_; + MKLDNNMatrixPtr outVal_; std::shared_ptr outGrad_; - std::shared_ptr wgtVal_; + MKLDNNMatrixPtr wgtVal_; std::shared_ptr wgtGrad_; - std::shared_ptr biasVal_; + MKLDNNMatrixPtr biasVal_; std::shared_ptr biasGrad_; public: @@ -94,7 +94,7 @@ class MKLDNNLayer : public Layer { stream_.reset(new MKLDNNStream()); engine_ = CPUEngine::Instance().getEngine(); - // TODO(TJ): deivecId + setDeviceID(MKLDNN_DEVICE); return true; } @@ -128,6 +128,19 @@ class MKLDNNLayer : public Layer { // TODO(TJ): isFmtSuppoted(fmt) return mkldnn::memory::desc(dims, type, fmt); } + + void resetMKLDNNOutput(size_t height, size_t width) { + Layer::resetOutput(height, width); + // get valu and grad, use mkldnn matrix instaed + // output_.value; + } + +protected: + void setDeviceID(int id) { + deviceId_ = id; + output_.deviceId = id; + // TODO: handle mkldnn device or add mkldnn device to other + } }; } // namespace paddle diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt index ad6de18c81d60..8afe6b509d24a 100644 --- a/paddle/math/CMakeLists.txt +++ b/paddle/math/CMakeLists.txt @@ -15,13 +15,9 @@ file(GLOB MATH_HEADERS . *.h) file(GLOB MATH_SOURCES . *.cpp) -message(STATUS "----------MATH_HEADERS:${MATH_HEADERS}") -message(STATUS "----------MATH_SOURCES:${MATH_SOURCES}") if(NOT WITH_MKLDNN) file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h") file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp") - message(STATUS "----------DNN_HEADER:${DNN_HEADER}") - message(STATUS "----------DNN_SOURCES:${DNN_SOURCES}") list(REMOVE_ITEM MATH_HEADERS ${DNN_HEADER}) list(REMOVE_ITEM MATH_SOURCES ${DNN_SOURCES}) message(STATUS "Skip compiling with MKLDNNMatrix") diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp index df8e72d78bedd..44fc54278c993 100644 --- a/paddle/math/MKLDNNMatrix.cpp +++ b/paddle/math/MKLDNNMatrix.cpp @@ -16,4 +16,31 @@ limitations under the License. */ using namespace mkldnn; // NOLINT -namespace paddle {} // namespace paddle +namespace paddle { + +MKLDNNMatrixPtr MKLDNNMatrix::create(const MatrixPtr& m, + memory::dims dims, + memory::format fmt, + engine& eg, + mkldnn::memory::data_type dtype) { + CpuMatrixPtr cpuM = std::dynamic_pointer_cast(m); + CHECK(cpuM) << "Only support create from CPU matrix yet"; + + size_t ndims = dims.size(); + CHECK(ndims > 0) << "Input dims should not be empty"; + size_t cnt = 1; + for (size_t i = 0; i < ndims; ++i) { + cnt *= dims[i]; + } + CHECK_EQ(cnt, m->getElementCnt()) << "Count size does not match"; + + size_t width = m->getWidth(); + size_t height = m->getHeight(); + real* data = m->getData(); + + memory::desc md = memory::desc(dims, dtype, fmt); + memory::primitive_desc pd = memory::primitive_desc(md, eg); + return std::make_shared(data, height, width, pd); +} + +} // namespace paddle diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h index 91ef56f2c3476..73eb50d2a0b3a 100644 --- a/paddle/math/MKLDNNMatrix.h +++ b/paddle/math/MKLDNNMatrix.h @@ -14,9 +14,8 @@ limitations under the License. 
*/ #pragma once -//#include "Matrix.h" -#include "Vector.h" - +#include +#include "Matrix.h" #include "mkldnn.hpp" #include "paddle/parameter/Parameter.h" @@ -32,14 +31,42 @@ typedef std::shared_ptr MKLDNNMatrixPtr; * @brief MKLDNN Matrix. * */ -class MKLDNNMatrix : public CpuVector { +class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory { public: - explicit MKLDNNMatrix(size_t size, int fmt) : CpuVector(size), fmt_(fmt) {} + MKLDNNMatrix(real* data, + size_t height, + size_t width, + mkldnn::memory::primitive_desc pd) + : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {} - ~MKLDNNMatrix() {} + MKLDNNMatrix(size_t height, size_t width, mkldnn::memory::primitive_desc pd) + : CpuMatrix(height, width, false), mkldnn::memory(pd) { + set_data_handle(CpuMatrix::getData()); + } + + static MKLDNNMatrixPtr create( + const MatrixPtr& m, + mkldnn::memory::dims dims, + mkldnn::memory::format fmt, + mkldnn::engine& eg, + mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); + + /** + * Get primitive descriptor + */ + mkldnn::memory::primitive_desc getPD() { return this->get_primitive_desc(); } -protected: - int fmt_; + /** + * Get memory descriptor + */ + mkldnn::memory::desc getMD() { return getPD().desc(); } + + /** + * Get format + */ + int getFormat() { return getMD().data.format; } + + ~MKLDNNMatrix() {} }; } // namespace paddle From 4eecd0c2d531f66e64eebff88a99488275143207 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 22 Aug 2017 14:18:16 +0800 Subject: [PATCH 05/12] use MKLDNNMatrix in fc backward --- paddle/gserver/layers/MKLDNNFcLayer.cpp | 77 ++++++++++++------------- paddle/gserver/layers/MKLDNNLayer.h | 59 ++++++++++++++----- paddle/math/MKLDNNMatrix.h | 33 +++++++++-- 3 files changed, 110 insertions(+), 59 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index fac0390eee501..5463104469632 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -158,10 +158,8 @@ void MKLDNNFcLayer::resetFwd() { hasSpatial_ ? memory::dims{oc_, ic_, ih_, iw_} : memory::dims{oc_, ic_}, hasSpatial_ ? format::oihw : format::oi, engine_); - biasVal_ = hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr; - outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_); // change original output to mkldnn output @@ -193,46 +191,41 @@ void MKLDNNFcLayer::resetBwd() { return; } needResetBwd_ = false; - bool hasBias = biases_ && biases_->getWGrad(); - real* iData = getInputValue(0)->getData(); - real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL; - real* oDiff = getOutputGrad()->getData(); - real* wDiff = weight_->getWGrad()->getData(); - real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL; /// backward weight - // create memory desc for backward memory - memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) - : createMD({bs_, ic_}, format::nc); - memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw) - : createMD({oc_, ic_}, format::oi); - memory::desc oMD = createMD({bs_, oc_}, format::nc); - memory::desc bMD = bDiff != NULL ? 
createMD({oc_}, format::x) - : createMD({}, format::format_undef); - - if (inVal_) { - // update data - inVal_->set_data_handle(iData); - } else { - LOG(FATAL) << "Should not be empty"; - // inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); - } - - // create memory primitive desc and memory self - wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff)); - outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff)); + CHECK(inVal_) << "Should have input value"; + const MatrixPtr& wgt = weight_->getWGrad(); + const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr; + const MatrixPtr& out = output_.grad; + + wgtGrad_ = MKLDNNMatrix::create( + wgt, wgtVal_->getDims(), wgtVal_->getFormat(), engine_); + biasGrad_ = + hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr; - fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD); + outGrad_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_); + // change original output to mkldnn output + // TODO: right? + output_.grad = std::dynamic_pointer_cast(outGrad_); + + // create memory primitive desc + fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, + inVal_->getMD(), + wgtGrad_->getMD(), + outGrad_->getMD()); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); - fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL - ? fc_bwdWgt::desc(iMD, wMD, bMD, oMD) - : fc_bwdWgt::desc(iMD, wMD, oMD); + fc_bwdWgt::desc bwdWgtDesc = + hasBias ? fc_bwdWgt::desc(inVal_->getMD(), + wgtGrad_->getMD(), + biasGrad_->getMD(), + outGrad_->getMD()) + : fc_bwdWgt::desc( + inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD()); fc_bwdWgt::primitive_desc bwdWgtPD = fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD); - if (bDiff != NULL) { - biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff)); + if (hasBias) { bwdWgt_.reset( new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_)); } else { @@ -242,13 +235,19 @@ void MKLDNNFcLayer::resetBwd() { pipelineBwd_.push_back(*bwdWgt_); /// backward data - if (iDiff == NULL) { + const MatrixPtr& in = getInputGrad(0); + if (in == nullptr) { return; } - fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD); + fc_bwdData::desc bwdDataDesc = + fc_bwdData::desc(inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD()); fc_bwdData::primitive_desc bwdDataPD = fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD); - inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff)); + + // TODO: check right, just from ingrad? 
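+ // (reuses the dims and format of the input value, so the backward data primitive writes straight into the paddle input grad buffer)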
+ inGrad_ = + MKLDNNMatrix::create(in, inVal_->getDims(), inVal_->getFormat(), engine_); + CHECK(wgtVal_) << "Should have weight memory"; bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_)); pipelineBwd_.push_back(*bwdData_); @@ -264,7 +263,7 @@ void MKLDNNFcLayer::forward(PassType passType) { // update input data // since it might be changed if this is after data layer real* iData = getInputValue(0)->getData(); - inVal_->set_data_handle(iData); + inVal_->updateData(iData); // just submit forward pipeline stream_->submit(pipelineFwd_); @@ -288,7 +287,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) { // update diff real* oDiff = getOutputGrad()->getData(); - outGrad_->set_data_handle(oDiff); + outGrad_->updateData(oDiff); // just sumbmit backward pipeline stream_->submit(pipelineBwd_); diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index b44095befb66a..fbd62d9aaa306 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -52,16 +52,15 @@ class MKLDNNLayer : public Layer { std::vector pipelineFwd_; std::vector pipelineBwd_; - // TODO(TJ): change below memory as MKLDNNMatrixPtr type - // MKLDNNMatrixPtr ; + // MKLDNNMatrixPtr MKLDNNMatrixPtr inVal_; - std::shared_ptr inGrad_; + MKLDNNMatrixPtr inGrad_; MKLDNNMatrixPtr outVal_; - std::shared_ptr outGrad_; + MKLDNNMatrixPtr outGrad_; MKLDNNMatrixPtr wgtVal_; - std::shared_ptr wgtGrad_; + MKLDNNMatrixPtr wgtGrad_; MKLDNNMatrixPtr biasVal_; - std::shared_ptr biasGrad_; + MKLDNNMatrixPtr biasGrad_; public: explicit MKLDNNLayer(const LayerConfig& config) @@ -84,17 +83,24 @@ class MKLDNNLayer : public Layer { virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) { + CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." + << "Please set WITH_MKLDNN=ON " + << "and set use_mkldnn=True"; + if (useGpu_ == true) { + LOG(WARNING) << "Do not support GPU yet, will change to useGpu = false"; + useGpu_ = false; + } + + // set device id before Layer::init + setDevice(MKLDNN_DEVICE); + // change param device to MKLDNN device + setParamsDevice(MKLDNN_DEVICE, parameterMap); if (!Layer::init(layerMap, parameterMap)) { return false; } - CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." - << "Please set WITH_MKLDNN=ON " - << "and set use_mkldnn=True"; stream_.reset(new MKLDNNStream()); engine_ = CPUEngine::Instance().getEngine(); - - setDeviceID(MKLDNN_DEVICE); return true; } @@ -136,10 +142,33 @@ class MKLDNNLayer : public Layer { } protected: - void setDeviceID(int id) { - deviceId_ = id; - output_.deviceId = id; - // TODO: handle mkldnn device or add mkldnn device to other + /** + * Set deviceId of this layer. + */ + void setDevice(int id) { deviceId_ = id; } + + /** + * Set deviceId of the params used in this layer. 
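+ * (looks up each input parameter, and the bias parameter if present, in parameterMap and calls setDevice(id) on it)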
+ */ + void setParamsDevice(int id, const ParameterMap& parameterMap) { + for (auto& inputConfig : config_.inputs()) { + if (inputConfig.has_input_parameter_name()) { + ParameterPtr parameter; + std::string name = inputConfig.input_parameter_name(); + CHECK(mapGet(name, parameterMap, ¶meter)) + << "Cannot find input parameter " << name << " for layer " + << getName(); + parameter->setDevice(id); + } + } + if (config_.has_bias_parameter_name()) { + ParameterPtr parameter; + std::string name = config_.bias_parameter_name(); + CHECK(mapGet(name, parameterMap, ¶meter)) + << "Cannot find bias parameter " << name << " for layer " + << getName(); + parameter->setDevice(id); + } } }; diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h index 73eb50d2a0b3a..54c0a1fdcbc47 100644 --- a/paddle/math/MKLDNNMatrix.h +++ b/paddle/math/MKLDNNMatrix.h @@ -44,6 +44,8 @@ class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory { set_data_handle(CpuMatrix::getData()); } + ~MKLDNNMatrix() {} + static MKLDNNMatrixPtr create( const MatrixPtr& m, mkldnn::memory::dims dims, @@ -52,21 +54,42 @@ class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory { mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); /** - * Get primitive descriptor + * Get primitive descriptor. */ mkldnn::memory::primitive_desc getPD() { return this->get_primitive_desc(); } /** - * Get memory descriptor + * Get memory descriptor. */ mkldnn::memory::desc getMD() { return getPD().desc(); } /** - * Get format + * Get dims. */ - int getFormat() { return getMD().data.format; } + mkldnn::memory::dims getDims() { + mkldnn::memory::dims dst; + int* src = getMD().data.dims; + int ndims = getMD().data.ndims; + dst.resize(ndims); + for (int i = 0; i < ndims; ++i) { + dst[i] = src[i]; + } + return dst; + } - ~MKLDNNMatrix() {} + /** + * Get format. + */ + mkldnn::memory::format getFormat() { + return (mkldnn::memory::format)(getMD().data.format); + } + + /** + * Update the memory data handle. + * Caution: This will not check the buffer size of the data, + * it should be coverd by user. + */ + void updateData(void* data) { set_data_handle(data); } }; } // namespace paddle From 48d87e5e912ad084ccc63dae8649f90a3f0989ba Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 23 Aug 2017 16:47:51 +0800 Subject: [PATCH 06/12] pass test, support input CPU device --- paddle/gserver/layers/Layer.h | 35 +++++--- paddle/gserver/layers/MKLDNNFcLayer.cpp | 108 +++++++++++++++--------- paddle/gserver/layers/MKLDNNLayer.h | 81 +++++++++++++++--- paddle/math/Allocator.h | 6 ++ paddle/math/MKLDNNMatrix.cpp | 71 +++++++++++++--- paddle/math/MKLDNNMatrix.h | 49 ++++++++--- 6 files changed, 258 insertions(+), 92 deletions(-) diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h index ec4d093e0cac9..edef36194aabd 100644 --- a/paddle/gserver/layers/Layer.h +++ b/paddle/gserver/layers/Layer.h @@ -82,6 +82,7 @@ class Layer { Argument output_; /// Several outputs stored on different devices, used in 'parallel_nn' case, /// and record them by deviceId_. + /// Also used in 'use_mkldnn' case. std::vector outputOtherDevice_; /// If there are several outputs, map them by each name. std::map outputMap_; @@ -177,6 +178,13 @@ class Layer { return inputLayer.getOutput(deviceId_); } + /** + * Get the argument of input layer with deviceId. + */ + const Argument& getInput(size_t inputIndex, int deviceId) const { + return inputLayers_[inputIndex]->getOutput(deviceId); + } + /** * Get the forward-input value. 
*/ @@ -191,6 +199,13 @@ class Layer { return inputLayer.getOutput(deviceId_).value; } + /** + * Get the forward-input value with deviceId. + */ + const MatrixPtr& getInputValue(int inputIndex, int deviceId) { + return inputLayers_[inputIndex]->getOutput(deviceId).value; + } + /** * Get the forward-input grad. */ @@ -205,6 +220,13 @@ class Layer { return inputLayer.getOutput(deviceId_).grad; } + /** + * Get the forward-input grad. + */ + const MatrixPtr& getInputGrad(int inputIndex, int deviceId) { + return inputLayers_[inputIndex]->getOutput(deviceId).grad; + } + /** * Get the forward-input label. */ @@ -326,19 +348,6 @@ class Layer { if (deviceId == getDeviceId()) { return output_; } else { - bool CPU2MKLDNN = - getDeviceId() == CPU_DEVICE && deviceId == MKLDNN_DEVICE; - bool MKLDNN2CPU = - getDeviceId() == MKLDNN_DEVICE && deviceId == CPU_DEVICE; - if (CPU2MKLDNN) { - // TODO: do something - return output_; - } else if (MKLDNN2CPU) { - // TODO: do something - return output_; - } - - // TODO: handle mkldnn device or add mkldnn device to other for (size_t i = 0; i < outputOtherDevice_.size(); i++) { if (outputOtherDevice_[i].deviceId == deviceId) { return outputOtherDevice_[i]; diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index 5463104469632..a3291e6a8fb75 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -97,7 +97,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() { } void MKLDNNFcLayer::reshape() { - const Argument& input = getInput(0); + const Argument& input = getInput(0, getPrev(0)->getDeviceId()); int batchSize = input.getBatchSize(); if (bs_ == batchSize) { return; @@ -135,35 +135,43 @@ void MKLDNNFcLayer::reshape() { void MKLDNNFcLayer::resetFwd() { bool hasBias = biases_ && biases_->getW(); - const MatrixPtr& in = getInputValue(0); const MatrixPtr& wgt = weight_->getW(); const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr; const MatrixPtr& out = output_.value; - if (getPrev(0)->getDeviceId() == MKLDNN_DEVICE) { + if (prevIsMKLDNN()) { + const MatrixPtr& in = getInputValue(0); inVal_ = std::dynamic_pointer_cast(in); CHECK(inVal_) << "Input should be MKLDNNMatrix"; - // TODO: change input nchw to nc if available - // inVal_->downSpatial() } else { + CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; + const MatrixPtr& in = getInputValue(0, CPU_DEVICE); inVal_ = MKLDNNMatrix::create( - in, - hasSpatial_ ? memory::dims{bs_, ic_, ih_, iw_} : memory::dims{bs_, ic_}, - hasSpatial_ ? format::nchw : format::nc, - engine_); + in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_); } - + inVal_->downSpatial(); wgtVal_ = MKLDNNMatrix::create( - wgt, - hasSpatial_ ? memory::dims{oc_, ic_, ih_, iw_} : memory::dims{oc_, ic_}, - hasSpatial_ ? format::oihw : format::oi, - engine_); + wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_); + wgtVal_->downSpatial(); biasVal_ = hasBias ? 
MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr; outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_); - // change original output to mkldnn output + // change original output value to mkldnn output value output_.value = std::dynamic_pointer_cast(outVal_); + if (!nextIsMKLDNN()) { + Argument cpuOutput; + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + if (outputOtherDevice_[i].deviceId == CPU_DEVICE) { + cpuOutput = outputOtherDevice_[i]; + } + } + cpuOutput.setFrameHeight(output_.getFrameHeight()); + cpuOutput.setFrameWidth(output_.getFrameWidth()); + + // fc cpu output value do not need convert + cpuOutput.value = output_.value; + } // create forward handle prop_kind pk = prop_kind::forward; @@ -176,12 +184,13 @@ void MKLDNNFcLayer::resetFwd() { : fc_fwd::desc( pk, inVal_->getMD(), wgtVal_->getMD(), outVal_->getMD()); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); - if (hasBias) { fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_)); } else { fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_)); } + printValueFormatFlow(); + pipelineFwd_.clear(); pipelineFwd_.push_back(*fwd_); } @@ -197,17 +206,24 @@ void MKLDNNFcLayer::resetBwd() { CHECK(inVal_) << "Should have input value"; const MatrixPtr& wgt = weight_->getWGrad(); const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr; - const MatrixPtr& out = output_.grad; - wgtGrad_ = MKLDNNMatrix::create( - wgt, wgtVal_->getDims(), wgtVal_->getFormat(), engine_); - biasGrad_ = - hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr; + if (nextIsMKLDNN()) { + // can not directly cast outputgrad to mkldnnmatrix, + // since each layer can not write the inputgrad to mkldnn inputgrad. + // So just create from matrix with outputvalue format. + const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad; + outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD()); + // TODO: maybe need merge topdiffs + } else { + // TODO: merge topdiffs + const MatrixPtr& out = getOutput(CPU_DEVICE).grad; + // fc do not need to convert from cpu device since output always nc + // only need create from cpu device + outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD()); + } - outGrad_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_); - // change original output to mkldnn output - // TODO: right? - output_.grad = std::dynamic_pointer_cast(outGrad_); + wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPD()); + biasGrad_ = hasBias ? 
MKLDNNMatrix::create(bias, biasVal_->getPD()) : nullptr; // create memory primitive desc fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, @@ -235,21 +251,38 @@ void MKLDNNFcLayer::resetBwd() { pipelineBwd_.push_back(*bwdWgt_); /// backward data - const MatrixPtr& in = getInputGrad(0); - if (in == nullptr) { - return; + if (prevIsMKLDNN()) { + const MatrixPtr& in = getInputGrad(0, MKLDNN_DEVICE); + if (in == nullptr) { + return; + } + if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) { + // TODO: many mkldnn bots + // add sum handle + } else { + inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD()); + } + } else { + const MatrixPtr& in = getInputGrad(0, CPU_DEVICE); + if (in == nullptr) { + return; + } + if (getInput(0, CPU_DEVICE).getAllCount() > 1) { + // TODO: many bots + // add sum handle + } else { + inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD()); + } } + fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD()); fc_bwdData::primitive_desc bwdDataPD = fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD); - // TODO: check right, just from ingrad? - inGrad_ = - MKLDNNMatrix::create(in, inVal_->getDims(), inVal_->getFormat(), engine_); - CHECK(wgtVal_) << "Should have weight memory"; bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_)); + printGradFormatFlow(); pipelineBwd_.push_back(*bwdData_); } @@ -259,11 +292,7 @@ void MKLDNNFcLayer::forward(PassType passType) { { REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); - - // update input data - // since it might be changed if this is after data layer - real* iData = getInputValue(0)->getData(); - inVal_->updateData(iData); + syncInputValue(); // just submit forward pipeline stream_->submit(pipelineFwd_); @@ -285,10 +314,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) { REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); resetBwd(); - // update diff - real* oDiff = getOutputGrad()->getData(); - outGrad_->updateData(oDiff); - + syncOutputGrad(); // just sumbmit backward pipeline stream_->submit(pipelineBwd_); } diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index fbd62d9aaa306..3dd17a36ff7ee 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -125,23 +125,80 @@ class MKLDNNLayer : public Layer { << ", oh: " << oh_ << ", ow: " << ow_; } - // TODO(TJ): move to MkldnnMatrix - // create memory desc - inline mkldnn::memory::desc createMD( - mkldnn::memory::dims dims, - mkldnn::memory::format fmt, - mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) { - // TODO(TJ): isFmtSuppoted(fmt) - return mkldnn::memory::desc(dims, type, fmt); + /** + * Print the mkldnn memory format flow of value + */ + virtual void printValueFormatFlow() { + if (inVal_ && outVal_) { + VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat() + << " >>> " << outVal_->getFormat(); + } } - void resetMKLDNNOutput(size_t height, size_t width) { - Layer::resetOutput(height, width); - // get valu and grad, use mkldnn matrix instaed - // output_.value; + /** + * Print the mkldnn memory format flow of grad + */ + virtual void printGradFormatFlow() { + if (inGrad_ && outGrad_) { + VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat() + << " <<< " << outGrad_->getFormat(); + } } protected: + /** + * If next layer only has MKLDNN type. + * Otherwise, only support otherdevice CPU device. 
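+ * (returns true only when no other-device output is attached, i.e. every consumer reads the MKLDNN output directly)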
+ */ + bool nextIsMKLDNN() { + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE) + << "Only support other device is CPU yet"; + } + return outputOtherDevice_.size() == 0; + } + + /** + * Is previous layer MKLDNN type. + * Otherwise, only support otherdevice CPU device. + */ + bool prevIsMKLDNN(int index = 0) { + int prevDevice = getPrev(index)->getDeviceId(); + if (prevDevice == MKLDNN_DEVICE) { + return true; + } else { + // do not support GPU yet + CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet"; + return false; + } + } + + /** + * Sync input value data + */ + void syncInputValue() { + if (prevIsMKLDNN()) { + return; + } + real* iData = getInputValue(0, CPU_DEVICE)->getData(); + // update input data + // since it might be changed if this is after data layer + inVal_->updateData(iData); + } + + /** + * Sync output grad data + */ + void syncOutputGrad() { + if (nextIsMKLDNN()) { + return; + } + + // update diff + real* oDiff = getOutput(CPU_DEVICE).grad->getData(); + outGrad_->updateData(oDiff); + } + /** * Set deviceId of this layer. */ diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index 666a8b8368e3e..94ef561f066a1 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -48,7 +48,13 @@ class CpuAllocator : public Allocator { */ virtual void* alloc(size_t size) { void* ptr; +#ifdef PADDLE_USE_MKLDNN + // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp + // memory alignment + CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0); +#else CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); +#endif CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; return ptr; } diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp index 44fc54278c993..24d54ec0f7313 100644 --- a/paddle/math/MKLDNNMatrix.cpp +++ b/paddle/math/MKLDNNMatrix.cpp @@ -18,29 +18,74 @@ using namespace mkldnn; // NOLINT namespace paddle { -MKLDNNMatrixPtr MKLDNNMatrix::create(const MatrixPtr& m, - memory::dims dims, - memory::format fmt, - engine& eg, - mkldnn::memory::data_type dtype) { - CpuMatrixPtr cpuM = std::dynamic_pointer_cast(m); - CHECK(cpuM) << "Only support create from CPU matrix yet"; - - size_t ndims = dims.size(); +MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) { + memory::desc md = pd.desc(); + size_t ndims = md.data.ndims; + int* dims = md.data.dims; CHECK(ndims > 0) << "Input dims should not be empty"; - size_t cnt = 1; + size_t cnts = 1; for (size_t i = 0; i < ndims; ++i) { - cnt *= dims[i]; + cnts *= dims[i]; } - CHECK_EQ(cnt, m->getElementCnt()) << "Count size does not match"; + if (m == nullptr) { + size_t height = dims[0]; + size_t width = cnts / dims[0]; + // LOG(INFO) << height << "," << width; + m = Matrix::create(height, width, false, false); + } + + CHECK(m) << " Matrix should not be empty"; + CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast(m); + CHECK(cpuMatrix) << "Only support create from CPU matrix yet"; + + CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match"; size_t width = m->getWidth(); size_t height = m->getHeight(); real* data = m->getData(); + return std::make_shared(data, height, width, pd); +} +MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, + memory::dims dims, + memory::format fmt, + engine& eg, + mkldnn::memory::data_type dtype) { memory::desc md = memory::desc(dims, dtype, fmt); memory::primitive_desc pd = memory::primitive_desc(md, eg); - return std::make_shared(data, height, width, pd); + return 
create(m, pd); +} + +void MKLDNNMatrix::downSpatial() { + int fmt = getFormat(); + if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) { + // only support nchw and oihw yet, later can support more like nhwc, ihwo + return; + } + + memory::dims srcDims = getDims(); + const int H = 2, W = 3; + if (srcDims[H] != 1 || srcDims[W] != 1) { + // can not down spatial + return; + } + + memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]}; + memory::format dstFmt; + switch (fmt) { + case memory::format::nchw: + dstFmt = memory::format::nc; + break; + case memory::format::oihw: + dstFmt = memory::format::oi; + break; + default: + LOG(FATAL) << "unsupported format"; + } + memory::desc md = memory::desc(dstDims, getDtype(), dstFmt); + memory::primitive_desc pd = memory::primitive_desc(md, getEngine()); + void* data = getData(); + memory(pd, data); } } // namespace paddle diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h index 54c0a1fdcbc47..05adc867c2076 100644 --- a/paddle/math/MKLDNNMatrix.h +++ b/paddle/math/MKLDNNMatrix.h @@ -39,20 +39,37 @@ class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory { mkldnn::memory::primitive_desc pd) : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {} - MKLDNNMatrix(size_t height, size_t width, mkldnn::memory::primitive_desc pd) - : CpuMatrix(height, width, false), mkldnn::memory(pd) { - set_data_handle(CpuMatrix::getData()); - } - ~MKLDNNMatrix() {} + /** + * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc + */ + static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd); + + /** + * Create MKLDNNMatrix from a MatrixPtr and memory details info + */ static MKLDNNMatrixPtr create( - const MatrixPtr& m, + MatrixPtr m, mkldnn::memory::dims dims, mkldnn::memory::format fmt, mkldnn::engine& eg, mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); +public: + /** + * Dimensionality reduction. + * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1 + */ + void downSpatial(); + + /** + * Update the memory data handle. + * Caution: This will not check the buffer size of the data, + * it should be covered by user. + */ + void updateData(void* data) { set_data_handle(data); } + /** * Get primitive descriptor. */ mkldnn::memory::primitive_desc getPD() { return this->get_primitive_desc(); } /** * Get memory descriptor. */ mkldnn::memory::desc getMD() { return getPD().desc(); } /** - * Get dims. + * Get dimensions. */ mkldnn::memory::dims getDims() { + mkldnn::memory::desc md = getMD(); + const int* src = md.data.dims; + int ndims = md.data.ndims; mkldnn::memory::dims dst; - int* src = getMD().data.dims; - int ndims = getMD().data.ndims; dst.resize(ndims); for (int i = 0; i < ndims; ++i) { dst[i] = src[i]; } return dst; } /** * Get format. */ mkldnn::memory::format getFormat() { return (mkldnn::memory::format)(getMD().data.format); } /** - * Update the memory data handle. - * Caution: This will not check the buffer size of the data, - * it should be coverd by user. + * Get memory data type. */ - void updateData(void* data) { set_data_handle(data); } + mkldnn::memory::data_type getDtype() { + return (mkldnn::memory::data_type)(getMD().data.data_type); + } + + /** + * Get engine. 
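+ * (the mkldnn::engine this memory primitive was created with)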
+ */ + mkldnn::engine getEngine() { return getPD().get_engine(); } }; } // namespace paddle From 780c8d969e0d2d220df19a672c141ff7c44f53d2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 23 Aug 2017 17:03:16 +0800 Subject: [PATCH 07/12] make downSpatial work, and remove hasSpatial_ --- paddle/gserver/layers/MKLDNNFcLayer.cpp | 4 ---- paddle/gserver/layers/MKLDNNFcLayer.h | 5 +---- paddle/math/MKLDNNMatrix.cpp | 7 ++++++- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index a3291e6a8fb75..a5555c4618aa3 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -111,10 +111,6 @@ void MKLDNNFcLayer::reshape() { if (iw_ == 0) { iw_ = 1; } - hasSpatial_ = true; - if (ih_ == 1 && iw_ == 1) { - hasSpatial_ = false; - } CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize()); ic_ = iLayerSize_ / (ih_ * iw_); CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible"; diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h index 7954852a23f81..e2657a8d5e9d9 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.h +++ b/paddle/gserver/layers/MKLDNNFcLayer.h @@ -32,16 +32,13 @@ class MKLDNNFcLayer : public MKLDNNLayer { // if has already init the weight bool hasInitedWgt_; - // if input layer has image size info (ih>1 && iw>1) - bool hasSpatial_; - // fc weight and bias std::unique_ptr weight_; std::unique_ptr biases_; public: explicit MKLDNNFcLayer(const LayerConfig& config) - : MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {} + : MKLDNNLayer(config), hasInitedWgt_(false) {} ~MKLDNNFcLayer() {} diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp index 24d54ec0f7313..94df9c155084c 100644 --- a/paddle/math/MKLDNNMatrix.cpp +++ b/paddle/math/MKLDNNMatrix.cpp @@ -85,7 +85,12 @@ void MKLDNNMatrix::downSpatial() { memory::desc md = memory::desc(dstDims, getDtype(), dstFmt); memory::primitive_desc pd = memory::primitive_desc(md, getEngine()); void* data = getData(); - memory(pd, data); + mkldnn_primitive_t result; + mkldnn::error::wrap_c_api( + mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr), + "could not create a memory primitive"); + reset(result); + set_data_handle(data); } } // namespace paddle From 4cc57836f393ada9b65cfeef444662afc34f1109 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 25 Aug 2017 17:20:28 +0800 Subject: [PATCH 08/12] enable reorder --- paddle/gserver/layers/MKLDNNFcLayer.cpp | 39 +++++------------ paddle/math/MKLDNNMatrix.cpp | 57 +++++++++++++++++++++++++ paddle/math/MKLDNNMatrix.h | 33 ++++++++++++-- 3 files changed, 97 insertions(+), 32 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index a5555c4618aa3..ad50c15a7dc70 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -61,39 +61,20 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { return; } - // TODO(TJ): dst format should get from wgtVal_ - int dstFmt = PARAM_FORMAT_MKLDNN_OI; - int srcFmt = weight_->getParameterPtr()->getHeaderFormat(); - if (srcFmt == dstFmt) { - return; - } - - // The weight_ is transposed from initial paddle weight - MatrixPtr paddleWgt = Matrix::create( - weight_->getW()->getData(), iLayerSize_, oc_, false, false); - - // TODO(TJ): remove this print when do not need differ weights - std::ostringstream ostr; - paddleWgt->print(ostr); - VLOG(MKLDNN_ALL) << "Initial Weight 
from paddle: " << std::endl << ostr.str(); - - // The mkldnn weight is transposed from initial paddle matrix - MatrixPtr paddleWgtT; - paddleWgt->transpose(paddleWgtT, true); - weight_->getW()->copyFrom(*paddleWgtT); - weight_->getParameterPtr()->setHeaderFormat(dstFmt); + CHECK(wgtVal_) << "should have been initialized"; + bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; + auto targetDim = wgtVal_->getDims(); + auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); hasInitedWgt_ = true; } void MKLDNNFcLayer::convertWeightsToPaddle() { - MatrixPtr dnnWgt = weight_->getW(); - MatrixPtr paddleWgt; - dnnWgt->transpose(paddleWgt, true); - - // copy paddle weight and override on weight_ - MatrixPtr dnnWgtT = Matrix::create( - dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false); - dnnWgtT->copyFrom(*paddleWgt); + CHECK(wgtVal_) << "should have been initialized"; + bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; + auto targetDim = wgtVal_->getDims(); + auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); } void MKLDNNFcLayer::reshape() { diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp index 94df9c155084c..32ae3b1bcf76a 100644 --- a/paddle/math/MKLDNNMatrix.cpp +++ b/paddle/math/MKLDNNMatrix.cpp @@ -56,6 +56,63 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, return create(m, pd); } +void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m, + memory::format srcFmt, + memory::dims targetDim) { + memory::format dstFmt = getFormat(); + if (srcFmt == dstFmt) { + return; + } + CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal"; + real* srcData = getData(); + real* dstData = m->getData(); + reorderOnce(srcData, dstData, srcFmt, dstFmt, targetDim); +} + +void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m, + memory::format dstFmt, + memory::dims targetDim) { + memory::format srcFmt = getFormat(); + if (srcFmt == dstFmt) { + return; + } + CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal"; + real* srcData = getData(); + real* dstData = m->getData(); + reorderOnce(srcData, dstData, srcFmt, dstFmt, targetDim); +} + +void MKLDNNMatrix::reorderOnce(void* srcData, + void* dstData, + memory::format srcFmt, + memory::format dstFmt, + memory::dims dm) { + CHECK(srcData); + CHECK(dstData); + MatrixPtr tmpSrc; + if (dstData == srcData) { + // inplace data + size_t sz = 1; + for (size_t i = 0; i < dm.size(); ++i) { + sz *= dm[i]; + } + tmpSrc = Matrix::create(sz, 1, false, false); + tmpSrc->copyFrom((real*)srcData, sz); + srcData = tmpSrc->getData(); + } + + auto dtype = this->getDtype(); + auto srcMD = memory::desc(dm, dtype, srcFmt); + auto dstMD = memory::desc(dm, dtype, dstFmt); + + auto eg = this->getEngine(); + auto src = memory(memory::primitive_desc(srcMD, eg), srcData); + auto dst = memory(memory::primitive_desc(dstMD, eg), dstData); + + auto r = reorder(src, dst); + stream(stream::kind::eager).submit({r}).wait(); +} + void MKLDNNMatrix::downSpatial() { int fmt = getFormat(); if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) { diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h index 05adc867c2076..ea3fd7d461cf8 100644 --- a/paddle/math/MKLDNNMatrix.h +++ b/paddle/math/MKLDNNMatrix.h @@ -21,9 +21,6 @@ limitations under the License. 
*/ namespace paddle { -static const std::map PARAM_FOARMAT_MAP = - {{mkldnn::memory::format::oi, PARAM_FORMAT_MKLDNN_OI}}; - class MKLDNNMatrix; typedef std::shared_ptr MKLDNNMatrixPtr; @@ -57,6 +54,26 @@ class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory { mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); public: + /** + * Reorder this MKLDNNMatrix from other format. + * Support inplace reorder + * Pay attention: this function would only reorder the data layout. + * will NOT change this original dim or format info + */ + void reorderDataFrom(const MKLDNNMatrixPtr& m, + memory::format srcFmt, + memory::dims targetDim); + + /** + * Reorder this MKLDNNMatrix to other format. + * Support inplace reorder + * Pay attention: this function would only reorder the data layout. + * will NOT change the dst dim or format info + */ + void reorderDataTo(const MKLDNNMatrixPtr& m, + memory::format dstFmt, + memory::dims targetDim); + /** * Dimensionality reduction. * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1 @@ -113,6 +130,16 @@ class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory { * Get engine. */ mkldnn::engine getEngine() { return getPD().get_engine(); } + +protected: + /** + * Do once reorder supported inplace. + */ + void reorderOnce(void* srcData, + void* dstData, + memory::format srcFmt, + memory::format dstFmt, + memory::dims dm); }; } // namespace paddle From 98b7c6736445de1f287156e933b0d625f648e6da Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 28 Aug 2017 09:52:58 +0800 Subject: [PATCH 09/12] add todo --- paddle/gserver/layers/MKLDNNFcLayer.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index ad50c15a7dc70..d38e6a2099e52 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -184,15 +184,14 @@ void MKLDNNFcLayer::resetBwd() { const MatrixPtr& wgt = weight_->getWGrad(); const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr; + // TODO(TJ): merge topdiffs if (nextIsMKLDNN()) { // can not directly cast outputgrad to mkldnnmatrix, // since each layer can not write the inputgrad to mkldnn inputgrad. // So just create from matrix with outputvalue format. 
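// (i.e. wrap the existing output grad buffer with the primitive desc of outVal_)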
const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad; outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD()); - // TODO: maybe need merge topdiffs } else { - // TODO: merge topdiffs const MatrixPtr& out = getOutput(CPU_DEVICE).grad; // fc do not need to convert from cpu device since output always nc // only need create from cpu device outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD()); } @@ -234,8 +233,7 @@ void MKLDNNFcLayer::resetBwd() { return; } if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) { - // TODO: many mkldnn bots - // add sum handle + // TODO(TJ): use outputMaps_ ways when merge topdiff done } else { inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD()); } @@ -245,8 +243,7 @@ void MKLDNNFcLayer::resetBwd() { return; } if (getInput(0, CPU_DEVICE).getAllCount() > 1) { - // TODO: many bots - // add sum handle + // TODO(TJ): use outputMaps_ ways when merge topdiff done } else { inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD()); } From fe51f726a2da85b0cb96734bd9b156760b044cf9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 28 Aug 2017 10:44:31 +0800 Subject: [PATCH 10/12] fix cmake --- paddle/math/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt index 8afe6b509d24a..68b5296228cd7 100644 --- a/paddle/math/CMakeLists.txt +++ b/paddle/math/CMakeLists.txt @@ -16,10 +16,10 @@ file(GLOB MATH_HEADERS . *.h) file(GLOB MATH_SOURCES . *.cpp) if(NOT WITH_MKLDNN) - file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h") - file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp") - list(REMOVE_ITEM MATH_HEADERS ${DNN_HEADER}) - list(REMOVE_ITEM MATH_SOURCES ${DNN_SOURCES}) + set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h") + set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp") + list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}") + list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}") message(STATUS "Skip compiling with MKLDNNMatrix") else() message(STATUS "Compile with MKLDNNMatrix") From bfbd066fdd1c4a81266864bf837d89742b3f2ad6 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 29 Aug 2017 19:55:44 +0800 Subject: [PATCH 11/12] refine --- paddle/gserver/layers/MKLDNNFcLayer.cpp | 117 ++++++++++++------------ paddle/gserver/layers/MKLDNNFcLayer.h | 2 + paddle/gserver/layers/MKLDNNLayer.h | 48 +++++++--- paddle/math/MKLDNNMatrix.cpp | 25 ++--- paddle/math/MKLDNNMatrix.h | 29 +++--- 5 files changed, 118 insertions(+), 103 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index d38e6a2099e52..a08cca318e5ff 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -77,6 +77,24 @@ void MKLDNNFcLayer::convertWeightsToPaddle() { wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); } +void MKLDNNFcLayer::convertOutputToOtherDevice() { + copyOutputInfoToOtherDevice(); + // find other cpu device and reorder output to cpu device + int cnt = 0; + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + if (outputOtherDevice_[i].deviceId == CPU_DEVICE) { + // fc cpu output value do not need convert + // just share the pointer + outputOtherDevice_[i].value = output_.value; + ++cnt; + } + } + + if (cnt > 1) { + LOG(WARNING) << "should not have more than one CPU device"; + } +} + void MKLDNNFcLayer::reshape() { const Argument& input = getInput(0, getPrev(0)->getDeviceId()); int batchSize = input.getBatchSize(); @@ -116,7 +134,7 @@ void MKLDNNFcLayer::resetFwd() { bool hasBias = biases_ && biases_->getW(); const MatrixPtr& wgt = weight_->getW(); const MatrixPtr& bias = hasBias ? 
biases_->getW() : nullptr; const MatrixPtr& out = output_.value; - if (prevIsMKLDNN()) { + if (prevIsOnlyMKLDNN()) { const MatrixPtr& in = getInputValue(0); inVal_ = std::dynamic_pointer_cast(in); CHECK(inVal_) << "Input should be MKLDNNMatrix"; @@ -136,30 +154,21 @@ void MKLDNNFcLayer::resetFwd() { // change original output value to mkldnn output value output_.value = std::dynamic_pointer_cast(outVal_); - if (!nextIsMKLDNN()) { - Argument cpuOutput; - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - if (outputOtherDevice_[i].deviceId == CPU_DEVICE) { - cpuOutput = outputOtherDevice_[i]; - } - } - cpuOutput.setFrameHeight(output_.getFrameHeight()); - cpuOutput.setFrameWidth(output_.getFrameWidth()); - - // fc cpu output value do not need convert - cpuOutput.value = output_.value; + if (!nextIsOnlyMKLDNN()) { + convertOutputToOtherDevice(); } // create forward handle prop_kind pk = prop_kind::forward; - fc_fwd::desc fwdDesc = - hasBias ? fc_fwd::desc(pk, - inVal_->getMD(), - wgtVal_->getMD(), - biasVal_->getMD(), - outVal_->getMD()) - : fc_fwd::desc( - pk, inVal_->getMD(), wgtVal_->getMD(), outVal_->getMD()); + fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk, + inVal_->getMemoryDesc(), + wgtVal_->getMemoryDesc(), + biasVal_->getMemoryDesc(), + outVal_->getMemoryDesc()) + : fc_fwd::desc(pk, + inVal_->getMemoryDesc(), + wgtVal_->getMemoryDesc(), + outVal_->getMemoryDesc()); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); if (hasBias) { fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_)); @@ -184,36 +193,38 @@ void MKLDNNFcLayer::resetBwd() { const MatrixPtr& wgt = weight_->getWGrad(); const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr; - // TODO(TJ): merge topdiffs - if (nextIsMKLDNN()) { + // TODO(TJ): merge outgrad + if (nextIsOnlyMKLDNN()) { // can not directly cast outputgrad to mkldnnmatrix, // since each layer can not write the inputgrad to mkldnn inputgrad. // So just create from matrix with outputvalue format. const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad; - outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD()); + outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc()); } else { const MatrixPtr& out = getOutput(CPU_DEVICE).grad; // fc do not need to convert from cpu device since output always nc // only need create from cpu device - outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD()); + outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc()); } - wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPD()); - biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPD()) : nullptr; + wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc()); + biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc()) + : nullptr; // create memory primitive desc fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, - inVal_->getMD(), - wgtGrad_->getMD(), - outGrad_->getMD()); + inVal_->getMemoryDesc(), + wgtGrad_->getMemoryDesc(), + outGrad_->getMemoryDesc()); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); - fc_bwdWgt::desc bwdWgtDesc = - hasBias ? fc_bwdWgt::desc(inVal_->getMD(), - wgtGrad_->getMD(), - biasGrad_->getMD(), - outGrad_->getMD()) - : fc_bwdWgt::desc( - inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD()); + fc_bwdWgt::desc bwdWgtDesc = hasBias + ? 
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index e2657a8d5e9d9..e138a6faf181c 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -72,6 +72,8 @@ class MKLDNNFcLayer : public MKLDNNLayer {
    * only would be called when needed
    */
   void resetBwd();
+
+  void convertOutputToOtherDevice() override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 3dd17a36ff7ee..8fe9630e82afb 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -86,10 +86,7 @@ class MKLDNNLayer : public Layer {
     CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
                             << "Please set WITH_MKLDNN=ON "
                             << "and set use_mkldnn=True";
-    if (useGpu_ == true) {
-      LOG(WARNING) << "Do not support GPU yet, will change to useGpu = false";
-      useGpu_ = false;
-    }
+    CHECK(!useGpu_) << "Do not support GPU yet";
 
     // set device id before Layer::init
     setDevice(MKLDNN_DEVICE);
@@ -116,6 +113,12 @@ class MKLDNNLayer : public Layer {
    */
   virtual void convertWeightsToPaddle() {}
 
+  /**
+   * convert MKLDNN output to other device.
+   * only support CPU device yet
+   */
+  virtual void convertOutputToOtherDevice() {}
+
   /**
    * print info about sizes
    */
@@ -147,22 +150,25 @@ class MKLDNNLayer : public Layer {
 
 protected:
   /**
-   * If next layer only has MKLDNN type.
-   * Otherwise, only support otherdevice CPU device.
+   * copy image size and sequence info to other device
    */
-  bool nextIsMKLDNN() {
+  void copyOutputInfoToOtherDevice() {
     for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
-          << "Only support other device is CPU yet";
+      outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
+      outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
+      outputOtherDevice_[i].sequenceStartPositions =
+          output_.sequenceStartPositions;
+      outputOtherDevice_[i].subSequenceStartPositions =
+          output_.subSequenceStartPositions;
+      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
     }
-    return outputOtherDevice_.size() == 0;
   }
 
   /**
-   * Is previous layer MKLDNN type.
-   * Otherwise, only support otherdevice CPU device.
+   * If the previous layer only has the MKLDNN device.
+   * Otherwise, only support the previous layer using CPU device.
    */
-  bool prevIsMKLDNN(int index = 0) {
+  bool prevIsOnlyMKLDNN(int index = 0) {
     int prevDevice = getPrev(index)->getDeviceId();
     if (prevDevice == MKLDNN_DEVICE) {
       return true;
@@ -173,11 +179,23 @@ class MKLDNNLayer : public Layer {
     }
   }
 
+  /**
+   * If the output only has the MKLDNN device.
+   * Otherwise, other devices should only use the CPU device.
+   */
+  bool nextIsOnlyMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    return outputOtherDevice_.size() == 0;
+  }
+
   /**
    * Sync input value data
    */
   void syncInputValue() {
-    if (prevIsMKLDNN()) {
+    if (prevIsOnlyMKLDNN()) {
      return;
     }
     real* iData = getInputValue(0, CPU_DEVICE)->getData();
@@ -190,7 +208,7 @@ class MKLDNNLayer : public Layer {
    * Sync output grad data
    */
   void syncOutputGrad() {
-    if (nextIsMKLDNN()) {
+    if (nextIsOnlyMKLDNN()) {
       return;
     }
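copyOutputInfoToOtherDevice() above moves only metadata between devices: frame sizes and sequence positions are propagated while the data buffers stay where they are. A minimal self-contained sketch of that idea, with a hypothetical Info struct standing in for paddle's Argument:

#include <cstddef>
#include <memory>
#include <vector>

// Hypothetical stand-in for paddle's Argument; only the copied fields appear.
struct Info {
  std::size_t frameHeight = 0;
  std::size_t frameWidth = 0;
  std::shared_ptr<std::vector<int>> sequenceStartPositions;
};

// Mirrors what copyOutputInfoToOtherDevice does: sizes and sequence info
// are copied, sequence positions are shared, data values are untouched.
void copyInfo(const Info& src, std::vector<Info>& others) {
  for (Info& o : others) {
    o.frameHeight = src.frameHeight;
    o.frameWidth = src.frameWidth;
    o.sequenceStartPositions = src.sequenceStartPositions;  // shared, not copied
  }
}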
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 32ae3b1bcf76a..0a355e2644cce 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -31,7 +31,6 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
   if (m == nullptr) {
     size_t height = dims[0];
     size_t width = cnts / dims[0];
-    // LOG(INFO) << height << "," << width;
     m = Matrix::create(height, width, false, false);
   }
 
@@ -40,10 +39,8 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
   CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
   CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
 
-  size_t width = m->getWidth();
-  size_t height = m->getHeight();
-  real* data = m->getData();
-  return std::make_shared<MKLDNNMatrix>(data, height, width, pd);
+  return std::make_shared<MKLDNNMatrix>(
+      m->getData(), m->getHeight(), m->getWidth(), pd);
 }
 
 MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
@@ -51,9 +48,7 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
                                      memory::format fmt,
                                      engine& eg,
                                      mkldnn::memory::data_type dtype) {
-  memory::desc md = memory::desc(dims, dtype, fmt);
-  memory::primitive_desc pd = memory::primitive_desc(md, eg);
-  return create(m, pd);
+  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
 }
 
 void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
@@ -64,9 +59,7 @@ void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
     return;
   }
   CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
-  real* srcData = getData();
-  real* dstData = m->getData();
-  reorderOnce(srcData, dstData, srcFmt, dstFmt, targetDim);
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
 }
 
 void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
@@ -77,9 +70,7 @@ void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
     return;
   }
   CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
-  real* srcData = getData();
-  real* dstData = m->getData();
-  reorderOnce(srcData, dstData, srcFmt, dstFmt, targetDim);
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
 }
 
 void MKLDNNMatrix::reorderOnce(void* srcData,
@@ -120,8 +111,9 @@ void MKLDNNMatrix::downSpatial() {
     return;
   }
 
-  memory::dims srcDims = getDims();
+  // TODO(TJ): change H(height) and W(width) if support nhwc or more
   const int H = 2, W = 3;
+  memory::dims srcDims = getDims();
   if (srcDims[H] != 1 || srcDims[W] != 1) {
     // can not down spatial
     return;
@@ -141,13 +133,12 @@ void MKLDNNMatrix::downSpatial() {
   }
   memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
   memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
-  void* data = getData();
   mkldnn_primitive_t result;
   mkldnn::error::wrap_c_api(
       mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
       "could not create a memory primitive");
   reset(result);
-  set_data_handle(data);
+  set_data_handle(getData());
 }
 
 }  // namespace paddle
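The two create() overloads tidied above are the entry points for wrapping a paddle matrix in an mkldnn layout. A hedged usage sketch; the shape {64, 128}, the nc format, and the f32 dtype are illustrative values, and the parameter order is assumed to follow the signatures visible in this hunk:

#include "paddle/math/MKLDNNMatrix.h"

void createBothWays(mkldnn::engine& eg, paddle::MatrixPtr m) {
  using mkldnn::memory;
  // build the primitive descriptor inline from dims/format/engine/dtype,
  // as the refactored overload now does in one expression
  paddle::MKLDNNMatrixPtr a = paddle::MKLDNNMatrix::create(
      m, memory::dims{64, 128}, memory::format::nc, eg, memory::data_type::f32);
  // or wrap a matrix with an already existing primitive descriptor
  paddle::MKLDNNMatrixPtr b = paddle::MKLDNNMatrix::create(m, a->getPrimitiveDesc());
  (void)b;
}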
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index ea3fd7d461cf8..e50f698b49571 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -56,9 +56,9 @@ class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
 public:
   /**
    * Reorder this MKLDNNMatrix from other format.
-   * Support inplace reorder
-   * Pay attention: this function would only reorder the data layout.
-   * will NOT change this original dim or format info
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   * will NOT change this original dim or format info
    */
   void reorderDataFrom(const MKLDNNMatrixPtr& m,
                        memory::format srcFmt,
@@ -66,9 +66,9 @@ class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
 
   /**
    * Reorder this MKLDNNMatrix to other format.
-   * Support inplace reorder
-   * Pay attention: this function would only reorder the data layout.
-   * will NOT change the dst dim or format info
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   * will NOT change the dst dim or format info
    */
   void reorderDataTo(const MKLDNNMatrixPtr& m,
                      memory::format dstFmt,
@@ -90,18 +90,20 @@ class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
   /**
    * Get primitive descriptor.
    */
-  mkldnn::memory::primitive_desc getPD() { return this->get_primitive_desc(); }
+  mkldnn::memory::primitive_desc getPrimitiveDesc() {
+    return this->get_primitive_desc();
+  }
 
   /**
    * Get memory descriptor.
    */
-  mkldnn::memory::desc getMD() { return getPD().desc(); }
+  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
 
   /**
    * Get dimensions.
    */
   mkldnn::memory::dims getDims() {
-    mkldnn::memory::desc md = getMD();
+    mkldnn::memory::desc md = getMemoryDesc();
     const int* src = md.data.dims;
     int ndims = md.data.ndims;
     mkldnn::memory::dims dst;
@@ -116,24 +118,25 @@ class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
    * Get format.
    */
   mkldnn::memory::format getFormat() {
-    return (mkldnn::memory::format)(getMD().data.format);
+    return (mkldnn::memory::format)(getMemoryDesc().data.format);
   }
 
   /**
    * Get memory data type.
   */
   mkldnn::memory::data_type getDtype() {
-    return (mkldnn::memory::data_type)(getMD().data.data_type);
+    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
   }
 
   /**
    * Get engine.
    */
-  mkldnn::engine getEngine() { return getPD().get_engine(); }
+  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
 
 protected:
   /**
-   * Do once reorder supported inplace.
+   * Do reorder once.
+   * Can support inplace.
    */
   void reorderOnce(void* srcData,
                    void* dstData,
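After this patch the layout accessors spell out what they return. A short sketch of a call site using the renamed getters; purely illustrative:

#include "paddle/math/MKLDNNMatrix.h"

void inspectLayout(paddle::MKLDNNMatrixPtr mat) {
  mkldnn::memory::primitive_desc pd = mat->getPrimitiveDesc();  // was getPD()
  mkldnn::memory::desc md = mat->getMemoryDesc();               // was getMD()
  mkldnn::memory::dims dims = mat->getDims();    // derived from the desc
  mkldnn::memory::format fmt = mat->getFormat();  // e.g. nc, oi, nchw
  (void)pd; (void)md; (void)dims; (void)fmt;
}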
From c5183caa04557628340983d17a64097f939db132 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Wed, 30 Aug 2017 13:37:51 +0800
Subject: [PATCH 12/12] rename

---
 paddle/gserver/layers/MKLDNNFcLayer.cpp | 29 +++++++++++--------------
 paddle/gserver/layers/MKLDNNLayer.h     | 12 +++++-----
 2 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index a08cca318e5ff..8318c8c519a4c 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -134,7 +134,7 @@ void MKLDNNFcLayer::resetFwd() {
   const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
   const MatrixPtr& out = output_.value;
 
-  if (prevIsOnlyMKLDNN()) {
+  if (inputIsOnlyMKLDNN()) {
     const MatrixPtr& in = getInputValue(0);
     inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
     CHECK(inVal_) << "Input should be MKLDNNMatrix";
@@ -154,7 +154,7 @@ void MKLDNNFcLayer::resetFwd() {
   // change original output value to mkldnn output value
   output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
 
-  if (!nextIsOnlyMKLDNN()) {
+  if (!outputIsOnlyMKLDNN()) {
     convertOutputToOtherDevice();
   }
 
@@ -194,19 +194,16 @@ void MKLDNNFcLayer::resetBwd() {
   const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
 
   // TODO(TJ): merge outgrad
-  if (nextIsOnlyMKLDNN()) {
-    // can not directly cast outputgrad to mkldnnmatrix,
-    // since each layer can not write the inputgrad to mkldnn inputgrad.
-    // So just create from matrix with outputvalue format.
-    const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad;
-    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
-  } else {
-    const MatrixPtr& out = getOutput(CPU_DEVICE).grad;
-    // fc do not need to convert from cpu device since output always nc
-    // only need create from cpu device
-    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
-  }
-
+  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  // for MKLDNN device:
+  //   can not directly cast outputgrad to mkldnnmatrix,
+  //   since each layer can not write the inputgrad to mkldnn inputgrad.
+  //   So just create from matrix with outputvalue format.
+  // for CPU device:
+  //   fc does not need to convert from cpu device since output is always
+  //   in nc format; only need to create from cpu device
+  const MatrixPtr& out = getOutput(device).grad;
+  outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
   wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
   biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
                       : nullptr;
@@ -238,7 +235,7 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdWgt_);
 
   /// backward data
-  int device = prevIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
   const MatrixPtr& in = getInputGrad(0, device);
   if (in == nullptr) {
     return;
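The comment block above explains why the output grad is wrapped with create() rather than cast: other layers allocate it as a plain Matrix, so a dynamic cast to MKLDNNMatrix would come back null. A hedged illustration of that point; wrapOutGrad is a hypothetical helper, not part of the patch:

#include <memory>
#include "paddle/math/MKLDNNMatrix.h"

paddle::MKLDNNMatrixPtr wrapOutGrad(paddle::MatrixPtr grad,
                                    paddle::MKLDNNMatrixPtr outVal) {
  auto cast = std::dynamic_pointer_cast<paddle::MKLDNNMatrix>(grad);
  // cast is expected to be null here, since grad was not allocated as an
  // MKLDNNMatrix; wrap the same buffer with the value's primitive descriptor
  return cast ? cast
              : paddle::MKLDNNMatrix::create(grad, outVal->getPrimitiveDesc());
}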
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 8fe9630e82afb..b983b833d510b 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -151,6 +151,8 @@ class MKLDNNLayer : public Layer {
 protected:
   /**
    * copy image size and sequence info to other device
+   * @note: can not directly use Layer::copyOutputToOtherDevice since here we
+   * only copy the base info and do not copy the data value
    */
   void copyOutputInfoToOtherDevice() {
     for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
@@ -165,10 +167,10 @@ class MKLDNNLayer : public Layer {
   }
 
   /**
-   * If the previous layer only has the MKLDNN device.
+   * If the input only has the MKLDNN device.
    * Otherwise, only support the previous layer using CPU device.
    */
-  bool prevIsOnlyMKLDNN(int index = 0) {
+  bool inputIsOnlyMKLDNN(int index = 0) {
     int prevDevice = getPrev(index)->getDeviceId();
     if (prevDevice == MKLDNN_DEVICE) {
       return true;
@@ -183,7 +185,7 @@ class MKLDNNLayer : public Layer {
    * If the output only has the MKLDNN device.
    * Otherwise, other devices should only use the CPU device.
    */
-  bool nextIsOnlyMKLDNN() {
+  bool outputIsOnlyMKLDNN() {
     for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
       CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
           << "Only support other device is CPU yet";
@@ -195,7 +197,7 @@ class MKLDNNLayer : public Layer {
    * Sync input value data
    */
   void syncInputValue() {
-    if (prevIsOnlyMKLDNN()) {
+    if (inputIsOnlyMKLDNN()) {
       return;
     }
     real* iData = getInputValue(0, CPU_DEVICE)->getData();
@@ -208,7 +210,7 @@ class MKLDNNLayer : public Layer {
    * Sync output grad data
    */
   void syncOutputGrad() {
-    if (nextIsOnlyMKLDNN()) {
+    if (outputIsOnlyMKLDNN()) {
       return;
     }