From 06748210d4771b37bd964e25513102cd2e0fccbf Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 12 Jul 2017 18:05:41 +0800
Subject: [PATCH 01/10] Fix some link errors about NNPACK.

---
 CMakeLists.txt                                |  3 ++-
 .../nnpack => cmake/external}/nnpack.cmake    | 14 +++++++++++
 paddle/function/CMakeLists.txt                |  1 -
 paddle/function/nnpack/NNPACKConvOp.cpp       | 23 +++++++++++--------
 4 files changed, 29 insertions(+), 12 deletions(-)
 rename {paddle/function/nnpack => cmake/external}/nnpack.cmake (54%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c713db3e3854..af58957ea8933 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,7 +135,8 @@ if(WITH_GPU)
 endif(WITH_GPU)
 
 if(USE_NNPACK)
-  list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+    include(external/nnpack)
+    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)
 
 add_subdirectory(proto)
diff --git a/paddle/function/nnpack/nnpack.cmake b/cmake/external/nnpack.cmake
similarity index 54%
rename from paddle/function/nnpack/nnpack.cmake
rename to cmake/external/nnpack.cmake
index 7182730ae8f13..d42bcb0f32904 100644
--- a/paddle/function/nnpack/nnpack.cmake
+++ b/cmake/external/nnpack.cmake
@@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
 find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
 find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
 find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)
 
 if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
   set(NNPACK_FOUND ON)
   INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+
+  set(NNPACK_LIBS)
+  list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
+  if (NNPACK_UKERNELS_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
+  endif()
+  if (NNPACK_CPUFEATURES_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
+  endif()
+  if(NOT ANDROID)
+    list(APPEND NNPACK_LIBS "rt")
+  endif()
 else()
   message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
 endif()
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 1518a8a654cfb..a5b14c0c71c18 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -11,7 +11,6 @@ if(WITH_GPU)
 endif()
 
 if(USE_NNPACK)
-  include(nnpack/nnpack.cmake)
   list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
   if(WITH_TESTING)
     add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
index e8080c3d714b3..e83bca5d9f0a6 100644
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -58,18 +58,10 @@ class NNPACKConvFunction : public ConvFunctionBase {
     workspaceBuffer_ = nullptr;
     workspaceSize_ = 0;
 
-    threadpool_ = nullptr;
-    if (FLAGS_nnpack_num_threads) {
-      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
-      VLOG(3) << "Number of threads "
-              << pthreadpool_get_threads_count(threadpool_);
-    }
+    create_nnpack_threadpool();
   }
 
   ~NNPACKConvFunction() {
-    if (threadpool_) {
-      pthreadpool_destroy(threadpool_);
-    }
     if (workspaceBuffer_) {
       free(workspaceBuffer_);
     }
@@ -225,14 +217,25 @@ class NNPACKConvFunction : public ConvFunctionBase {
     }
   }
 
+  static void create_nnpack_threadpool() {
+    if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) {
+      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
+      VLOG(3) << "Number of threads "
+              << pthreadpool_get_threads_count(threadpool_);
+    }
+  }
+
 private:
   nnp_convolution_algorithm algorithm_;
   nnp_convolution_transform_strategy transform_strategy_;
   void* workspaceBuffer_;
   size_t workspaceSize_;
-  pthreadpool_t threadpool_;
+  static pthreadpool_t threadpool_;
 };
 
+template <DeviceType Device>
+pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
+
 REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
 
 }  // namespace paddle

From 891e5dcc48590375d37364634838b6da260fd41e Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 12 Jul 2017 20:13:07 +0800
Subject: [PATCH 02/10] Modify the default value of nnpack_allocate_outside.

---
 paddle/function/nnpack/NNPACKConvOp.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
index e83bca5d9f0a6..f0ec77a5d0033 100644
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/function/ConvOp.h"
 
 DEFINE_bool(nnpack_allocate_outside,
-            false,
+            true,
             "Allocate and free workspace memory outside the NNPACK interface.");
 DEFINE_int32(nnpack_num_threads,
              0,

From 68adb9541d339ffd0df43a7a45a5a4adf16f2067 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Sat, 15 Jul 2017 15:00:18 +0800
Subject: [PATCH 03/10] enbale tensor memory test

---
 paddle/framework/CMakeLists.txt |   2 +-
 paddle/framework/tensor.h       |  50 ++++++++------
 paddle/framework/tensor_test.cc | 118 +++++++++++++++++---------------
 3 files changed, 92 insertions(+), 78 deletions(-)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 8415ce67e9039..f7f606e4b8cf8 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -2,7 +2,7 @@
 cc_library(ddim SRCS ddim.cc)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
+cc_test(tensor_test SRCS tensor_test.cc DEPS ddim paddle_memory)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
 cc_test(enforce_test SRCS enforce_test.cc)
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 62e0710a8244c..81db722c99f07 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -29,8 +29,6 @@ class Tensor {
  public:
   Tensor() : numel_(0), offset_(0) {}
 
-  Tensor& operator=(const Tensor& src) = delete;
-
   template <typename T>
   const T* data() const {
     CheckDims<T>();
@@ -39,13 +37,13 @@ class Tensor {
   }
 
   template <typename T>
-  T* mutable_data(DDim dims, paddle::platform::Place place) {
+  T* mutable_data(DDim dims, platform::Place place) {
     set_dims(dims);
     return mutable_data<T>(place);
   }
 
   template <typename T>
-  T* mutable_data(paddle::platform::Place place) {
+  T* mutable_data(platform::Place place) {
     PADDLE_ENFORCE(numel_ > 0,
                    "Tensor::numel_ must be larger than zero to call "
                    "Tensor::mutable_data. Call Tensor::set_dim first.");
@@ -53,7 +51,18 @@ class Tensor {
         !(holder_->place() ==
           place) /* some versions of boost::variant don't have operator!= */
         || holder_->size() < numel_ * sizeof(T) + offset_) {
-      holder_.reset(new PlaceholderImpl<T>(place, numel_ * sizeof(T)));
+      switch (place.which()) {
+        case 0:
+          holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
+              boost::get<platform::GPUPlace>(place), numel_ * sizeof(T)));
+          break;
+
+        case 1:
+          holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+              boost::get<platform::CPUPlace>(place), numel_ * sizeof(T)));
+          break;
+      }
+
       offset_ = 0;
     }
     return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
@@ -69,7 +78,7 @@ class Tensor {
   }
 
   template <typename T>
-  void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) {
+  void CopyFrom(const Tensor& src, platform::Place dst_place) {
     PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
                        platform::is_cpu_place(dst_place),
                    "Tensor::CopyFrom only support CPU now.");
@@ -119,38 +128,37 @@ class Tensor {
   struct Placeholder {
     virtual ~Placeholder() {}
     virtual void* ptr() const = 0;
-    virtual paddle::platform::Place place() const = 0;
+    virtual platform::Place place() const = 0;
     virtual size_t size() const = 0;
   };
 
-  template <typename T>
+  template <typename T, typename PlaceType>
   struct PlaceholderImpl : public Placeholder {
    private:
+    template <typename PType>
     class Deleter {
      public:
-      Deleter(platform::Place place) : place_(place) {}
-      void operator()(T* ptr) {
-        paddle::memory::Free(place_, static_cast<void*>(ptr));
-      }
+      Deleter(PType place) : place_(place) {}
+      void operator()(T* ptr) { memory::Free(place_, static_cast<void*>(ptr)); }
 
      private:
-      paddle::platform::Place place_;
+      PType place_;
     };
 
    public:
-    PlaceholderImpl(paddle::platform::Place place, size_t size)
-        : ptr_(static_cast<T*>(paddle::memory::Alloc(place, size)),
-               Deleter(place)),
+    PlaceholderImpl(PlaceType place, size_t size)
+        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
+               Deleter<PlaceType>(place)),
           place_(place),
           size_(size) {}
 
     virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual size_t size() const { return size_; }
-    virtual paddle::platform::Place place() const { return place_; }
+    virtual platform::Place place() const { return place_; }
 
-    std::unique_ptr<T, Deleter> ptr_;
-    paddle::platform::Place place_;  // record the place of ptr_.
-    size_t size_;                    // size of the memory block.
+    std::unique_ptr<T, Deleter<PlaceType>> ptr_;
+    platform::Place place_;  // record the place of ptr_.
+    size_t size_;            // size of the memory block.
   };
 
   template <typename T>
@@ -166,7 +174,7 @@ class Tensor {
   DDim dims_;
   size_t numel_;   // cache of `product(dims_)`
   size_t offset_;  // marks the begin of tensor data area.
-};
+};                 // namespace framework
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 255f69372f4f0..79bd0cc607b10 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -47,7 +47,7 @@ TEST(Tensor, DataAssert) {
 
 /* following tests are not available at present
    because Memory::Alloc() and Memory::Free() have not been ready.
-
+*/
 TEST(Tensor, MutableData) {
   using namespace paddle::framework;
   using namespace paddle::platform;
@@ -72,28 +72,29 @@ TEST(Tensor, MutableData) {
     p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
     EXPECT_EQ(p1, p2);
   }
-
-  {
-    Tensor src_tensor;
-    float* p1 = nullptr;
-    float* p2 = nullptr;
-    // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
-    EXPECT_NE(p1, nullptr);
-    // set src_tensor a new dim with large size
-    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
-    EXPECT_NE(p2, nullptr);
-    EXPECT_NE(p1, p2);
-    // set src_tensor a new dim with same size
-    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
-    EXPECT_EQ(p1, p2);
-    // set src_tensor a new dim with smaller size
-    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
-    EXPECT_EQ(p1, p2);
-  }
+  /*
+    {
+      Tensor src_tensor;
+      float* p1 = nullptr;
+      float* p2 = nullptr;
+      // initialization
+      p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+      EXPECT_NE(p1, nullptr);
+      // set src_tensor a new dim with large size
+      // momery is supposed to be re-allocated
+      p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
+      EXPECT_NE(p2, nullptr);
+      EXPECT_NE(p1, p2);
+      // set src_tensor a new dim with same size
+      // momery block is supposed to be unchanged
+      p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
+      EXPECT_EQ(p1, p2);
+      // set src_tensor a new dim with smaller size
+      // momery block is supposed to be unchanged
+      p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
+      EXPECT_EQ(p1, p2);
+    }
+    */
 }
 
 TEST(Tensor, ShareDataFrom) {
@@ -108,9 +109,11 @@ TEST(Tensor, ShareDataFrom) {
       dst_tensor.ShareDataFrom<float>(src_tensor);
     } catch (EnforceNotMet err) {
       caught = true;
-      std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data
-first."; const char* what = err.what(); for (size_t i = 0; i < msg.length();
-++i) { ASSERT_EQ(what[i], msg[i]);
+      std::string msg =
+          "Tenosr holds no memory. Call Tensor::mutable_data first.";
+      const char* what = err.what();
+      for (size_t i = 0; i < msg.length(); ++i) {
+        ASSERT_EQ(what[i], msg[i]);
       }
     }
     ASSERT_TRUE(caught);
@@ -120,13 +123,15 @@ first."; const char* what = err.what(); for (size_t i = 0; i < msg.length();
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-  {
-    Tensor src_tensor;
-    Tensor dst_tensor;
-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
-    dst_tensor.ShareDataFrom<int>(src_tensor);
-    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
-  }
+  /*
+    {
+      Tensor src_tensor;
+      Tensor dst_tensor;
+      src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
+      dst_tensor.ShareDataFrom<int>(src_tensor);
+      ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+    }
+    */
 }
 
 TEST(Tensor, Slice) {
@@ -155,27 +160,29 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-  {
-    Tensor src_tensor;
-    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
-    Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
-    DDim slice_dims = slice_tensor.dims();
-    ASSERT_EQ(arity(slice_dims), 2);
-    EXPECT_EQ(slice_dims[0], 4);
-    EXPECT_EQ(slice_dims[1], 9);
-
-    uintptr_t src_data_address =
-        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
-    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
-    uintptr_t slice_data_address =
-        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
-    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
-    EXPECT_EQ(src_data_address, src_mutable_data_address);
-    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
-    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
-  }
+  /*
+    {
+      Tensor src_tensor;
+      src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
+      Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
+      DDim slice_dims = slice_tensor.dims();
+      ASSERT_EQ(arity(slice_dims), 2);
+      EXPECT_EQ(slice_dims[0], 4);
+      EXPECT_EQ(slice_dims[1], 9);
+
+      uintptr_t src_data_address =
+          reinterpret_cast<uintptr_t>(src_tensor.data<double>());
+      uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
+          src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
+      uintptr_t slice_data_address =
+          reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
+      uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
+          slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
+      EXPECT_EQ(src_data_address, src_mutable_data_address);
+      EXPECT_EQ(slice_data_address, slice_mutable_data_address);
+      EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
+    }
+    */
 }
 
 TEST(Tensor, CopyFrom) {
@@ -202,5 +209,4 @@ TEST(Tensor, CopyFrom) {
   for (size_t i = 0; i < 3; ++i) {
     EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
   }
-}
-*/
\ No newline at end of file
+}
\ No newline at end of file

From 66cf21c880fba791910dc449dfc716d11c52803f Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Sat, 15 Jul 2017 07:16:11 +0000
Subject: [PATCH 04/10] fix compile error

---
 paddle/framework/CMakeLists.txt |  2 +-
 paddle/framework/tensor_test.cc | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index f7f606e4b8cf8..b8bfab5320c3f 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -2,7 +2,7 @@
 cc_library(ddim SRCS ddim.cc)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-cc_test(tensor_test SRCS tensor_test.cc DEPS ddim paddle_memory)
+cc_test(tensor_test SRCS tensor_test.cc DEPS ddim place paddle_memory)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
 cc_test(enforce_test SRCS enforce_test.cc)
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 79bd0cc607b10..30b1448a9b2af 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -72,7 +72,7 @@ TEST(Tensor, MutableData) {
     p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
     EXPECT_EQ(p1, p2);
   }
-  /*
+    #ifdef __CUDACC__
     {
       Tensor src_tensor;
       float* p1 = nullptr;
@@ -94,7 +94,7 @@ TEST(Tensor, MutableData) {
       p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
       EXPECT_EQ(p1, p2);
     }
-    */
+    #endif
 }
 
 TEST(Tensor, ShareDataFrom) {
@@ -123,7 +123,7 @@ TEST(Tensor, ShareDataFrom) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-  /*
+  #ifdef __CUDACC__
     {
       Tensor src_tensor;
       Tensor dst_tensor;
@@ -131,7 +131,7 @@ TEST(Tensor, ShareDataFrom) {
       dst_tensor.ShareDataFrom<int>(src_tensor);
       ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
     }
-    */
+    #endif
 }
 
 TEST(Tensor, Slice) {
@@ -160,7 +160,7 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-  /*
+  #ifdef __CUDACC__
     {
       Tensor src_tensor;
       src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
@@ -182,7 +182,7 @@ TEST(Tensor, Slice) {
       EXPECT_EQ(slice_data_address, slice_mutable_data_address);
       EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
     }
-    */
+    #endif
 }
 
 TEST(Tensor, CopyFrom) {
@@ -209,4 +209,4 @@ TEST(Tensor, CopyFrom) {
   for (size_t i = 0; i < 3; ++i) {
     EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
   }
-}
\ No newline at end of file
+}

From afa2a88d7896a03feb18b3cf6e6736c8ca79fcad Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Sat, 15 Jul 2017 15:25:06 +0800
Subject: [PATCH 05/10] add conditional compilation for tensor

---
 paddle/framework/tensor.h       |   5 ++
 paddle/framework/tensor_test.cc | 108 ++++++++++++++++----------------
 2 files changed, 59 insertions(+), 54 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 81db722c99f07..29bad7a00a439 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -51,6 +51,7 @@ class Tensor {
         !(holder_->place() ==
           place) /* some versions of boost::variant don't have operator!= */
         || holder_->size() < numel_ * sizeof(T) + offset_) {
+#ifdef __CUDACC__
       switch (place.which()) {
         case 0:
           holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
@@ -62,6 +63,10 @@ class Tensor {
               boost::get<platform::CPUPlace>(place), numel_ * sizeof(T)));
           break;
       }
+#else
+      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), numel_ * sizeof(T)));
+#endif
 
       offset_ = 0;
     }
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 30b1448a9b2af..84c6f0cf65588 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -72,29 +72,29 @@ TEST(Tensor, MutableData) {
     p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
     EXPECT_EQ(p1, p2);
   }
-    #ifdef __CUDACC__
-    {
-      Tensor src_tensor;
-      float* p1 = nullptr;
-      float* p2 = nullptr;
-      // initialization
-      p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
-      EXPECT_NE(p1, nullptr);
-      // set src_tensor a new dim with large size
-      // momery is supposed to be re-allocated
-      p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
-      EXPECT_NE(p2, nullptr);
-      EXPECT_NE(p1, p2);
-      // set src_tensor a new dim with same size
-      // momery block is supposed to be unchanged
-      p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
-      EXPECT_EQ(p1, p2);
-      // set src_tensor a new dim with smaller size
-      // momery block is supposed to be unchanged
-      p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
-      EXPECT_EQ(p1, p2);
-    }
-    #endif
+#ifdef __CUDACC__
+  {
+    Tensor src_tensor;
+    float* p1 = nullptr;
+    float* p2 = nullptr;
+    // initialization
+    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+    EXPECT_NE(p1, nullptr);
+    // set src_tensor a new dim with large size
+    // momery is supposed to be re-allocated
+    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
+    EXPECT_NE(p2, nullptr);
+    EXPECT_NE(p1, p2);
+    // set src_tensor a new dim with same size
+    // momery block is supposed to be unchanged
+    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
+    EXPECT_EQ(p1, p2);
+    // set src_tensor a new dim with smaller size
+    // momery block is supposed to be unchanged
+    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
+    EXPECT_EQ(p1, p2);
+  }
+#endif
 }
 
 TEST(Tensor, ShareDataFrom) {
@@ -123,15 +123,15 @@ TEST(Tensor, ShareDataFrom) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-  #ifdef __CUDACC__
-    {
-      Tensor src_tensor;
-      Tensor dst_tensor;
-      src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
-      dst_tensor.ShareDataFrom<int>(src_tensor);
-      ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
-    }
-    #endif
+#ifdef __CUDACC__
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
+    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
+    dst_tensor.ShareDataFrom<int>(src_tensor);
+    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+  }
+#endif
 }
 
 TEST(Tensor, Slice) {
@@ -160,29 +160,29 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-  #ifdef __CUDACC__
-    {
-      Tensor src_tensor;
-      src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
-      Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
-      DDim slice_dims = slice_tensor.dims();
-      ASSERT_EQ(arity(slice_dims), 2);
-      EXPECT_EQ(slice_dims[0], 4);
-      EXPECT_EQ(slice_dims[1], 9);
+#ifdef __CUDACC__
+  {
+    Tensor src_tensor;
+    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
+    Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
+    DDim slice_dims = slice_tensor.dims();
+    ASSERT_EQ(arity(slice_dims), 2);
+    EXPECT_EQ(slice_dims[0], 4);
+    EXPECT_EQ(slice_dims[1], 9);
 
-      uintptr_t src_data_address =
-          reinterpret_cast<uintptr_t>(src_tensor.data<double>());
-      uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-          src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
-      uintptr_t slice_data_address =
-          reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
-      uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-          slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
-      EXPECT_EQ(src_data_address, src_mutable_data_address);
-      EXPECT_EQ(slice_data_address, slice_mutable_data_address);
-      EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
-    }
-    #endif
+    uintptr_t src_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
+    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
+        src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
+    uintptr_t slice_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
+    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
+        slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
+    EXPECT_EQ(src_data_address, src_mutable_data_address);
+    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
+    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
+  }
+#endif
 }
 
 TEST(Tensor, CopyFrom) {

From 9e0c6800c53701fc50dfb69a2c8b6de19c52c559 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Sat, 15 Jul 2017 20:18:54 +0800
Subject: [PATCH 06/10] Python Generate OpCreation Methods by OpProto

All OpCreation method are generated by
`create_op_creation_methods::__bootstrap__` method, and stores in
`op_creations` object and its methods.

There are three parts to implement this feature.

1. Get all registered `OpProto` from C++ side. It is implemented in
`get_all_op_protos` method.
1. Create a function to convert `kwargs` to `OpDesc` base on each op's
`OpProto`. The `OpDescCreationMethod` class.
1. Convert `OpProto` to `docstring` by `get_docstring_from_op_proto`
method.

All three methods are unit tested. The `__bootstrap__` just combines
them together and create a method in runtime.

For details, please reference the doc string in
`create_op_creation_methods.py` and the unit test
`test_op_creation_methods.py`.
---
 paddle/framework/op_registry.h                |  24 ++
 paddle/framework/operator.cc                  |  28 +-
 paddle/framework/operator.h                   |   8 +-
 paddle/pybind/pybind.cc                       |  17 ++
 .../framework/create_op_creation_methods.py   | 235 +++++++++++++++++
 .../tests/test_op_creation_methods.py         | 243 +++++++++++++++++-
 python/paddle/v2/optimizer.py                 |   2 +
 7 files changed, 539 insertions(+), 18 deletions(-)

diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index de20e7af05a8d..3d67541db2022 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <algorithm>
+#include <atomic>
 #include <type_traits>
 #include <unordered_map>
 #include <unordered_set>
@@ -199,8 +200,12 @@ class OpRegistry {
   }
 
   static OperatorPtr CreateOp(const OpDesc& op_desc) {
+    //! Create a OpPtr by type.
     std::string op_type = op_desc.type();
     OperatorPtr op(creators().at(op_type)());
+
+    //! Fill op's data member. Not use constructor because it will be noising
+    //! for Op developer.
     op->desc_ = op_desc;
     op->inputs_.reserve((size_t)op_desc.inputs_size());
     std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
@@ -208,10 +213,18 @@ class OpRegistry {
     op->outputs_.reserve((size_t)op_desc.outputs_size());
     std::copy(op_desc.outputs().begin(), op_desc.outputs().end(),
               std::back_inserter(op->outputs_));
+
+    //! Fill attrs, and validate attrs.
     for (auto& attr : op_desc.attrs()) {
       op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
     }
     op_checkers().at(op_type).Check(op->attrs_);
+
+    //! Convert Temporary variable name to an unique variable name.
+    AssignTempVariable(op.get());
+
+    //! Other op's custom Init for a complex Op. For simple Op, the Init
+    //! method do nothing.
     op->Init();
     return op;
   }
@@ -222,6 +235,17 @@ class OpRegistry {
   };
 
  private:
+  static void AssignTempVariable(OperatorBase* op) {
+    static std::atomic<size_t> gUniqId(0UL);
+    for (auto& outname : op->outputs_) {
+      if (outname == OperatorBase::TMP_VAR_NAME()) {
+        outname += op->Type();
+        outname += "@";
+        outname += std::to_string(gUniqId.fetch_add(1));
+      }
+    }
+  }
+
   static std::unordered_map<std::string, OpCreator>& creators() {
     static std::unordered_map<std::string, OpCreator> creators_;
     return creators_;
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index d065670829ccb..a467d328e1d57 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -19,23 +19,21 @@ namespace framework {
 
 std::string OperatorBase::DebugString() const {
   std::stringstream ss;
-  ss << "=================\n";
-  ss << "type = " << desc_.type() << "\n";
-  ss << "inputs = [";
-  for (auto& ipt : inputs_) {
-    ss << ipt << ", ";
+  ss << "Op(" << Type() << "), inputs:(";
+  for (size_t i = 0; i < inputs_.size(); ++i) {
+    ss << inputs_[i];
+    if (i != inputs_.size() - 1) {
+      ss << ", ";
+    }
   }
-  ss << "]\n";
-  ss << "outputs = [";
-  for (auto& opt : outputs_) {
-    ss << opt << ", ";
+  ss << "), outputs:(";
+  for (size_t i = 0; i < outputs_.size(); ++i) {
+    ss << outputs_[i];
+    if (i != outputs_.size() - 1) {
+      ss << ", ";
+    }
   }
-  ss << "]\n";
-  ss << "attr_keys = [";
-  for (auto& attr : attrs_) {
-    ss << attr.first << ", ";
-  }
-  ss << "]\n";
+  ss << ").";
   return ss.str();
 }
 
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index cf79f379fae1e..cc166048b7bf8 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -39,6 +39,13 @@ using OperatorPtr = std::shared_ptr<OperatorBase>;
  */
 class OperatorBase {
  public:
+  /// If a variable is a empty variable, that name will be used.
+  static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; }
+
+  /// If a variable is a temporary variable, that name will be set in Python,
+  /// but it will be convert to a unique name in scope after OpCreator.
+  static std::string TMP_VAR_NAME() { return "@TEMP@"; }
+
   virtual ~OperatorBase() {}
 
   template <typename T>
@@ -62,7 +69,6 @@ class OperatorBase {
   virtual void Run(const ScopePtr& scope,
                    const platform::DeviceContext& dev_ctx) const = 0;
 
- protected:
   std::string Type() const { return desc_.type(); }
 
  public:
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index c1a025ed0492f..b5ead21fd0128 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -63,6 +63,23 @@ All parameter, weight, gradient are variables in Paddle.
     }
     return ret_values;
   });
+  m.def_submodule(
+       "var_names",
+       "The module will return special predefined variable name in Paddle")
+      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
+      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
+
+  py::class_<pd::OperatorBase, pd::OperatorPtr>(m, "Operator")
+      .def("__str__", &pd::OperatorBase::DebugString)
+      .def_static("create", [](const std::string& protobin) {
+        pd::OpDesc desc;
+        PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                       "Cannot parse user input to OpDesc");
+        PADDLE_ENFORCE(desc.IsInitialized(),
+                       "User OpDesc is not initialized, reason %s",
+                       desc.InitializationErrorString());
+        return pd::OpRegistry::CreateOp(desc);
+      });
 
   return m.ptr();
 }
diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py
index 2fcdfead25414..c2a7ae7692b08 100644
--- a/python/paddle/v2/framework/create_op_creation_methods.py
+++ b/python/paddle/v2/framework/create_op_creation_methods.py
@@ -1,11 +1,246 @@
 import paddle.v2.framework.core as core
 import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
+import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
+import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
+import cStringIO
 
 
 def get_all_op_protos():
+    """
+    Get all registered op proto from Paddle C++
+    :return: list of OpProto
+    """
     protostrs = core.get_all_op_protos()
     ret_values = []
     for pbstr in protostrs:
         op_proto = op_proto_pb2.OpProto.FromString(str(pbstr))
         ret_values.append(op_proto)
     return ret_values
+
+
+class OpDescCreationMethod(object):
+    """
+    A Functor object to convert user input(use key word args) to OpDesc based on
+    OpProto.
+    
+    :param op_proto: The OpProto object.
+    :type op_proto: op_proto_pb2.OpProto
+    """
+
+    def __init__(self, op_proto):
+        if not isinstance(op_proto, op_proto_pb2.OpProto):
+            raise TypeError("Argument should be OpProto")
+        self.__op_proto__ = op_proto
+
+    def __call__(self, *args, **kwargs):
+        """
+        Convert user input to OpDesc. Only key-word args are supported. 
+        :return: OpDesc based on user input
+        :rtype: op_desc_pb2.OpDesc
+        """
+        if len(args) != 0:
+            raise ValueError("Only keyword arguments is supported by Paddle")
+        op_desc = op_desc_pb2.OpDesc()
+
+        # Inputs
+        ipts, ipt_format, _ = OpDescCreationMethod.extract_input_or_output(
+            "input", kwargs, self.__op_proto__.inputs)
+        op_desc.inputs.extend(ipts)
+        if ipt_format is not None:
+            op_desc.attrs.extend([ipt_format])
+
+        # Outputs
+        outs, out_format, tmp_index = OpDescCreationMethod.extract_input_or_output(
+            "output", kwargs, self.__op_proto__.outputs)
+        op_desc.outputs.extend(outs)
+        if out_format is not None:
+            op_desc.attrs.extend([out_format])
+        if len(tmp_index) != 0:
+            tmp_index_attr = op_desc.attrs.add()
+            tmp_index_attr.type = attr_type_pb2.INTS
+            tmp_index_attr.name = "temporary_index"
+            tmp_index_attr.ints.extend(tmp_index)
+
+        # Types
+        op_desc.type = self.__op_proto__.type
+
+        # Attrs
+        for attr in self.__op_proto__.attrs:
+            if attr.generated:
+                continue
+            user_defined_attr = kwargs.get(attr.name, None)
+            if user_defined_attr is not None:
+                new_attr = op_desc.attrs.add()
+                new_attr.name = attr.name
+                new_attr.type = attr.type
+                if attr.type == attr_type_pb2.INT:
+                    new_attr.i = user_defined_attr
+                elif attr.type == attr_type_pb2.FLOAT:
+                    new_attr.f = user_defined_attr
+                elif attr.type == attr_type_pb2.STRING:
+                    new_attr.s = user_defined_attr
+                elif attr.type == attr_type_pb2.INTS:
+                    new_attr.ints.extend(user_defined_attr)
+                elif attr.type == attr_type_pb2.FLOATS:
+                    new_attr.floats.extend(user_defined_attr)
+                elif attr.type == attr_type_pb2.STRINGS:
+                    new_attr.strings.extend(user_defined_attr)
+                else:
+                    raise NotImplementedError("Not support attribute type " +
+                                              attr.type)
+
+        return op_desc
+
+    @staticmethod
+    def extract_input_or_output(in_out, kwargs, meta):
+        """
+        Extract input variable names or output variable names from key-word 
+        arguments, which base on VarProtos.
+        
+        :param in_out: "input" or "output"
+        :param kwargs: key-word arguments that user inputted.
+        :param meta: a list of VarProto
+        :return: The three object will be return. The variable names. The 
+        input_format or output_format attribute(None if the input or output is 
+        not multiple). The temporary variable index list.
+        """
+        multiple = OpDescCreationMethod.any_is_true((m.multiple for m in meta))
+        tmp_index = []
+        retv = []
+        if multiple:
+            var_format = op_desc_pb2.AttrDesc()
+            var_format.type = attr_type_pb2.INTS
+            var_format.name = "%s_format" % in_out
+            var_format.ints.append(0)
+
+            for var in meta:
+                var_name = var.name
+
+                if var.temporary:
+                    var_name = [core.var_names.temp()]
+                    tmp_index.append(len(retv))
+                else:
+                    var_name = kwargs.get(var_name, [])
+                if not isinstance(var_name, list):
+                    var_name = [var_name]
+                retv.extend(var_name)
+                var_format.ints.append(len(var_name) + var_format.ints[-1])
+            return retv, var_format, tmp_index
+        else:
+            for var in meta:
+                if var.temporary:
+                    retv.append(kwargs.get(var.name, core.var_names.temp()))
+                    tmp_index.append(len(retv))
+                else:
+                    retv.append(kwargs.get(var.name, core.var_names.empty()))
+            return retv, None, tmp_index
+
+    @staticmethod
+    def any_is_true(generator):
+        """
+        Reduce a bool array to one. If any of them is True, then return True.
+        """
+        for flag in generator:
+            if flag:
+                return True
+        return False
+
+
+def get_docstring_from_op_proto(op_proto):
+    """
+    Generate docstring from a OpProto
+    :param op_proto: a OpProto instance.
+    :type op_proto: op_proto_pb2.OpProto
+    :return: docstring
+    """
+    if not isinstance(op_proto, op_proto_pb2.OpProto):
+        raise TypeError("Input must be OpProto")
+    f = cStringIO.StringIO()
+    f.write(op_proto.comment)
+    f.write("\n")
+
+    def __append_param__(name, comment, type):
+        # Maybe replace the following line with template engine is better.
+        f.write(":param ")
+        f.write(name)
+        f.write(": ")
+        f.write(comment)
+        f.write("\n")
+        f.write(":type ")
+        f.write(name)
+        f.write(": ")
+        f.write(type)
+        f.write("\n")
+
+    for ipt in op_proto.inputs:
+        __append_param__(ipt.name, ipt.comment, "list | basestr"
+                         if ipt.multiple else "basestr")
+
+    temp_var_prefix = \
+        "This is a temporary variable. It does not have to set by user. "
+    for opt in op_proto.outputs:
+        __append_param__(opt.name, opt.comment if not opt.temporary else
+                         temp_var_prefix + opt.comment, "list | basestr"
+                         if opt.multiple else "basestr")
+
+    for attr in op_proto.attrs:
+        attr_type = None
+        if attr.type == attr_type_pb2.INT:
+            attr_type = "int"
+        elif attr.type == attr_type_pb2.FLOAT:
+            attr_type = "float"
+        elif attr.type == attr_type_pb2.STRING:
+            attr_type = "basestr"
+        elif attr.type == attr_type_pb2.INTS:
+            attr_type = "list of int"
+        elif attr.type == attr_type_pb2.FLOATS:
+            attr_type = "list of float"
+        elif attr.type == attr_type_pb2.STRINGS:
+            attr_type = "list of basestr"
+
+        if attr_type is None:
+            raise RuntimeError("Not supported attribute type " + attr.type)
+
+        __append_param__(attr.name, attr.comment, attr_type)
+
+    return f.getvalue()
+
+
+def create_op_creation_method(op_proto):
+    """
+    Generate op creation method for an OpProto
+    """
+    method = OpDescCreationMethod(op_proto)
+
+    def __impl__(*args, **kwargs):
+        opdesc = method(*args, **kwargs)
+        return core.Operator.create(opdesc.SerializeToString())
+
+    __impl__.__doc__ = get_docstring_from_op_proto(op_proto)
+    return __impl__
+
+
+class OpCreationsHolder(object):
+    """
+    A object will holds all op creation methods.
+    
+    Use `op_creations.xxx_op` to access them.
+    """
+    pass
+
+
+op_creations = OpCreationsHolder()
+
+
+def __bootstrap__():
+    """
+    Bootstrap function for this module. It will dynamic create all op creation
+    methods in runtime.
+    """
+    for op_proto in get_all_op_protos():
+        func = create_op_creation_method(op_proto)
+        func.__name__ = str(op_proto.type)
+        setattr(op_creations, func.__name__, func)
+
+
+__bootstrap__()
diff --git a/python/paddle/v2/framework/tests/test_op_creation_methods.py b/python/paddle/v2/framework/tests/test_op_creation_methods.py
index b205e2cabb99a..41db7c0d535aa 100644
--- a/python/paddle/v2/framework/tests/test_op_creation_methods.py
+++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py
@@ -1,9 +1,13 @@
 import unittest
 import paddle.v2.framework.create_op_creation_methods as creation
+import paddle.v2.framework.core as core
+import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
+import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
+import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
 
 
-class TestOpCreationsMethods(unittest.TestCase):
-    def test_all_protos(self):
+class TestGetAllProtos(unittest.TestCase):
+    def test_all(self):
         all_protos = creation.get_all_op_protos()
         self.assertNotEqual(0, len(all_protos))
 
@@ -11,5 +15,240 @@ def test_all_protos(self):
             self.assertTrue(each.IsInitialized())
 
 
+class TestOpDescCreationMethod(unittest.TestCase):
+    def test_plain_input_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = "X"
+        ipt.comment = "not matter"
+
+        ipt = op.inputs.add()
+        ipt.name = "Y"
+        ipt.comment = "not matter"
+
+        opt = op.outputs.add()
+        opt.name = "Z"
+        opt.comment = "not matter"
+
+        op.comment = "not matter"
+
+        self.assertTrue(op.IsInitialized())
+
+        method = creation.OpDescCreationMethod(op)
+        output = method(X="a", Y="b", Z="c")
+
+        expected = op_desc_pb2.OpDesc()
+        expected.type = "test"
+        expected.inputs.extend(["a", "b"])
+        expected.outputs.append("c")
+        self.assertEqual(expected, output)
+
+    def test_multiple_input_plain_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "fc"
+        ipt = op.inputs.add()
+        ipt.name = "X"
+        ipt.comment = ""
+        ipt.multiple = True
+
+        ipt = op.inputs.add()
+        ipt.name = "W"
+        ipt.comment = ""
+        ipt.multiple = True
+
+        ipt = op.inputs.add()
+        ipt.name = "b"
+        ipt.comment = ""
+
+        out = op.outputs.add()
+        out.name = "Y"
+        out.comment = ""
+
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+        method = creation.OpDescCreationMethod(op)
+
+        generated1 = method(X="x", W="w", b="b", Y="y")
+        expected1 = op_desc_pb2.OpDesc()
+        expected1.inputs.extend(['x', 'w', 'b'])
+        expected1.outputs.extend(['y'])
+        expected1.type = 'fc'
+        attr = expected1.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3])
+        self.assertEqual(expected1, generated1)
+
+        generated2 = method(
+            X=['x1', 'x2', 'x3'], b='b', W=['w1', 'w2', 'w3'], Y='y')
+        expected2 = op_desc_pb2.OpDesc()
+        expected2.inputs.extend(['x1', 'x2', 'x3', 'w1', 'w2', 'w3', 'b'])
+        expected2.outputs.extend(['y'])
+        expected2.type = 'fc'
+        attr = expected2.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 3, 6, 7])
+        self.assertEqual(expected2, generated2)
+
+    def test_attrs(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = 'X'
+        ipt.comment = ""
+
+        def __add_attr__(name, type):
+            attr = op.attrs.add()
+            attr.name = name
+            attr.comment = ""
+            attr.type = type
+
+        __add_attr__("int_attr", attr_type_pb2.INT)
+        __add_attr__("float_attr", attr_type_pb2.FLOAT)
+        __add_attr__("string_attr", attr_type_pb2.STRING)
+        __add_attr__("ints_attr", attr_type_pb2.INTS)
+        __add_attr__("floats_attr", attr_type_pb2.FLOATS)
+        __add_attr__("strings_attr", attr_type_pb2.STRINGS)
+
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+
+        method = creation.OpDescCreationMethod(op)
+
+        generated = method(
+            X="a",
+            int_attr=10,
+            float_attr=3.2,
+            string_attr="test_str",
+            ints_attr=[0, 1, 2, 3, 4],
+            floats_attr=[0.2, 3.2, 4.5],
+            strings_attr=["a", "b", "c"])
+
+        expected = op_desc_pb2.OpDesc()
+        expected.type = "test"
+        expected.inputs.extend(['a'])
+        attr = expected.attrs.add()
+        attr.name = "int_attr"
+        attr.type = attr_type_pb2.INT
+        attr.i = 10
+
+        attr = expected.attrs.add()
+        attr.name = "float_attr"
+        attr.type = attr_type_pb2.FLOAT
+        attr.f = 3.2
+
+        attr = expected.attrs.add()
+        attr.name = "string_attr"
+        attr.type = attr_type_pb2.STRING
+        attr.s = "test_str"
+
+        attr = expected.attrs.add()
+        attr.name = "ints_attr"
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3, 4])
+
+        attr = expected.attrs.add()
+        attr.name = 'floats_attr'
+        attr.type = attr_type_pb2.FLOATS
+        attr.floats.extend([0.2, 3.2, 4.5])
+
+        attr = expected.attrs.add()
+        attr.name = 'strings_attr'
+        attr.type = attr_type_pb2.STRINGS
+        attr.strings.extend(['a', 'b', 'c'])
+
+        self.assertEqual(expected, generated)
+
+    def test_input_temporary_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        out = op.outputs.add()
+        out.name = "OUT"
+        out.comment = ""
+
+        out = op.outputs.add()
+        out.name = "TMP"
+        out.comment = ""
+        out.temporary = True
+
+        out = op.outputs.add()
+        out.name = "OUT2"
+        out.comment = ""
+        op.comment = ""
+
+        method = creation.OpDescCreationMethod(op)
+        generated = method(OUT="a", OUT2="b")
+        desc = op_desc_pb2.OpDesc()
+        desc.outputs.extend(["a", core.var_names.temp(), "b"])
+        desc.type = "test"
+        attr = desc.attrs.add()
+        attr.name = "temporary_index"
+        attr.type = attr_type_pb2.INTS
+        attr.ints.append(2)
+        self.assertEqual(generated, desc)
+
+
+class TestOpCreationDocStr(unittest.TestCase):
+    def test_all(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        op.comment = """Test Op.
+
+This op is used for unit test, not a real op.
+"""
+        a = op.inputs.add()
+        a.name = "a"
+        a.comment = "Input a for test op"
+        a.multiple = True
+
+        b = op.inputs.add()
+        b.name = "b"
+        b.comment = "Input b for test op"
+        self.assertTrue(op.IsInitialized())
+
+        o1 = op.outputs.add()
+        o1.name = "output"
+        o1.comment = "The output of test op"
+
+        o2 = op.outputs.add()
+        o2.name = "temp output"
+        o2.comment = "The temporary output of test op"
+        o2.temporary = True
+
+        test_str = op.attrs.add()
+        test_str.name = "str_attr"
+        test_str.type = attr_type_pb2.STRING
+        test_str.comment = "A string attribute for test op"
+
+        actual = creation.get_docstring_from_op_proto(op)
+        expected_docstring = '''Test Op.
+
+This op is used for unit test, not a real op.
+
+:param a: Input a for test op
+:type a: list | basestr
+:param b: Input b for test op
+:type b: basestr
+:param output: The output of test op
+:type output: basestr
+:param temp output: This is a temporary variable. It does not have to set by user. The temporary output of test op
+:type temp output: basestr
+:param str_attr: A string attribute for test op
+:type str_attr: basestr
+'''
+        self.assertEqual(expected_docstring, actual)
+
+
+class TestOpCreations(unittest.TestCase):
+    def test_all(self):
+        add_op = creation.op_creations.add_two(X="a", Y="b", Out="z")
+        self.assertIsNotNone(add_op)
+        # Invoke C++ DebugString()
+        self.assertEqual('Op(add_two), inputs:(a, b), outputs:(z).',
+                         str(add_op))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index b6ee51cfe899f..173a30a41181c 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -25,6 +25,8 @@ def __impl__():
 
         self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config(
             __impl__)
+        if swig_api is None:
+            raise RuntimeError("paddle.v2 currently need swig_paddle")
         self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto(
             self.__opt_conf_proto__)
 

From c5bc126762031231eb8a144d3318c9dcbaea68ed Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 17 Jul 2017 12:42:04 +0800
Subject: [PATCH 07/10] Follow comment, rename to `GenerateTempVariableName`

---
 paddle/framework/op_registry.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index b627b4a60a728..ec237950dff72 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -220,7 +220,7 @@ class OpRegistry {
     op_checkers().at(op_type).Check(op->attrs_);
 
     //! Convert Temporary variable name to an unique variable name.
-    AssignTempVariable(op.get());
+    GenerateTempVariableName(op.get());
 
     //! Other op's custom Init for a complex Op. For simple Op, the Init
     //! method do nothing.
@@ -234,7 +234,7 @@ class OpRegistry {
   };
 
  private:
-  static void AssignTempVariable(OperatorBase* op) {
+  static void GenerateTempVariableName(OperatorBase* op) {
     static std::atomic<size_t> gUniqId(0UL);
     for (auto& outname : op->outputs_) {
       if (outname == OperatorBase::TMP_VAR_NAME()) {

From c78a5e5da24e7e7edc7d5cfd92b349f3913773ac Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 17 Jul 2017 13:11:47 +0800
Subject: [PATCH 08/10] Fix merge error before

---
 python/paddle/v2/optimizer.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 260a5094695d4..ba581980334fe 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -1,4 +1,3 @@
-import py_paddle.swig_paddle as swig_api
 import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
 import paddle.trainer_config_helpers.optimizers as v1_optimizers
 """
@@ -17,6 +16,7 @@
 
 class Optimizer(object):
     def __init__(self, **kwargs):
+        import py_paddle.swig_paddle as swig_api
         if 'batch_size' in kwargs:
             del kwargs['batch_size']  # not important for python library.
 
@@ -25,8 +25,6 @@ def __impl__():
 
         self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config(
             __impl__)
-        if swig_api is None:
-            raise RuntimeError("paddle.v2 currently need swig_paddle")
         self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto(
             self.__opt_conf_proto__)
 
@@ -37,18 +35,22 @@ def enable_types(self):
         For each optimizer(SGD, Adam), GradientMachine should enable different
         buffers.
         """
+        import py_paddle.swig_paddle as swig_api
         tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__)
         assert isinstance(tmp, swig_api.ParameterOptimizer)
         return tmp.getParameterTypes()
 
     def __create_local_updater__(self):
+        import py_paddle.swig_paddle as swig_api
         return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
 
     def __create_remote_updater__(self, pass_num, use_sparse_updater):
+        import py_paddle.swig_paddle as swig_api
         return swig_api.ParameterUpdater.createRemoteUpdater(
             self.__opt_conf__, pass_num, use_sparse_updater)
 
     def __create_new_remote_updater__(self, pserver_spec, use_etcd):
+        import py_paddle.swig_paddle as swig_api
         return swig_api.ParameterUpdater.createNewRemoteUpdater(
             self.__opt_conf__, pserver_spec, use_etcd)
 

From 78bd815e8504496ccae388bb799cc8026427084c Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Mon, 17 Jul 2017 19:48:33 +0800
Subject: [PATCH 09/10] refine conditional compilation and remove `numel_`

---
 paddle/framework/tensor.h | 40 +++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 29bad7a00a439..b405e3877c9f1 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -27,7 +27,7 @@ namespace framework {
 
 class Tensor {
  public:
-  Tensor() : numel_(0), offset_(0) {}
+  Tensor() : offset_(0) {}
 
   template <typename T>
   const T* data() const {
@@ -44,30 +44,26 @@ class Tensor {
 
   template <typename T>
   T* mutable_data(platform::Place place) {
-    PADDLE_ENFORCE(numel_ > 0,
-                   "Tensor::numel_ must be larger than zero to call "
+    PADDLE_ENFORCE(product(dims_) > 0,
+                   "Tensor's numel must be larger than zero to call "
                    "Tensor::mutable_data. Call Tensor::set_dim first.");
     if (holder_ == nullptr ||
         !(holder_->place() ==
           place) /* some versions of boost::variant don't have operator!= */
-        || holder_->size() < numel_ * sizeof(T) + offset_) {
+        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
+      if (platform::is_cpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
+      } else if (platform::is_gpu_place(place)) {
 #ifdef __CUDACC__
-      switch (place.which()) {
-        case 0:
-          holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
-              boost::get<platform::GPUPlace>(place), numel_ * sizeof(T)));
-          break;
-
-        case 1:
-          holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-              boost::get<platform::CPUPlace>(place), numel_ * sizeof(T)));
-          break;
-      }
+        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
+            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
 #else
-      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), numel_ * sizeof(T)));
+        PADDLE_ENFORCE(true, "'GPUPlace' is not supported in CPU only device.");
 #endif
-
+      } else {
+        PADDLE_ENFORCE(true, "Unknown 'place'.");
+      }
       offset_ = 0;
     }
     return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
@@ -88,7 +84,7 @@ class Tensor {
                        platform::is_cpu_place(dst_place),
                    "Tensor::CopyFrom only support CPU now.");
     src.CheckDims<T>();
-    size_t size = src.numel_ * sizeof(T);
+    size_t size = product(src.dims_) * sizeof(T);
     set_dims(src.dims());
     const void* src_ptr = static_cast<const void*>(src.data<T>());
     void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
@@ -122,7 +118,6 @@ class Tensor {
       return;
     }
     dims_ = dims;
-    numel_ = product(dims_);
   }
 
   DDim dims() const { return dims_; }
@@ -170,16 +165,15 @@ class Tensor {
   inline void CheckDims() const {
     PADDLE_ENFORCE(holder_ != nullptr,
                    "Tenosr holds no memory. Call Tensor::mutable_data first.");
-    PADDLE_ENFORCE(holder_->size() >= numel_ * sizeof(T) + offset_,
+    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
                    "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
                    "first to re-allocate memory.");
   }
 
   std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
   DDim dims_;
-  size_t numel_;   // cache of `product(dims_)`
   size_t offset_;  // marks the begin of tensor data area.
-};                 // namespace framework
+};
 
 }  // namespace framework
 }  // namespace paddle

From 78fa5e307da3cb32706f396346d3db7a875b4178 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Mon, 17 Jul 2017 20:00:58 +0800
Subject: [PATCH 10/10] Add DDim::size()

---
 paddle/framework/ddim.cc      | 2 ++
 paddle/framework/ddim.h       | 2 ++
 paddle/framework/ddim_test.cc | 1 +
 3 files changed, 5 insertions(+)

diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 73f5499ad1575..b6ad8b60aaf7b 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -117,6 +117,8 @@ int DDim::operator[](int idx) const {
   return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }
 
+ssize_t DDim::size() const { return arity(*this); }
+
 bool DDim::operator==(DDim d) const {
   if (var.which() != d.getVar().which()) {
     return false;
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index a0c2a8a74afde..7bc21a1e3455b 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -50,6 +50,8 @@ struct DDim {
 
   DDimVar getVar() { return var; }
 
+  ssize_t size() const;
+
   bool operator==(DDim d) const;
 
   bool operator!=(DDim d) const;
diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc
index 6a099f2aeb4aa..9d18a2972ce62 100644
--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -49,6 +49,7 @@ TEST(DDim, Equality) {
 
   // arity of a DDim
   EXPECT_EQ(paddle::framework::arity(ddim), 3);
+  EXPECT_EQ(ddim.size(), 3);
 
   // product of a DDim
   EXPECT_EQ(paddle::framework::product(vddim), 45);