From 28942cddcc23d27df15b1009305d4b92c8a89337 Mon Sep 17 00:00:00 2001
From: wangliu
Date: Tue, 29 May 2018 13:48:06 +0800
Subject: [PATCH 01/26] refine unit test

---
 src/framework/op_registry.h              |  8 ------
 src/io.cpp                               | 36 ++++--------------------
 src/operators/kernel/arm/conv_kernel.cpp | 11 ++------
 test/executor_for_test.h                 |  6 ++--
 test/operators/test_sigmoid_op.cpp       |  4 ---
 test/operators/test_softmax_op.cpp       |  2 +-
 6 files changed, 11 insertions(+), 56 deletions(-)

diff --git a/src/framework/op_registry.h b/src/framework/op_registry.h
index 233de642be7..62398dcb15d 100644
--- a/src/framework/op_registry.h
+++ b/src/framework/op_registry.h
@@ -90,14 +90,6 @@ class OpRegistry {
       const std::string& type, const VariableNameMap& inputs,
       const VariableNameMap& outputs, const AttributeMap attrs,
       std::shared_ptr<Scope> scope) {
-    LOG(paddle_mobile::kLOG_DEBUG1) << " type: " << type;
-    LOG(paddle_mobile::kLOG_DEBUG1) << " input size: " << inputs.size();
-    LOG(paddle_mobile::kLOG_DEBUG1) << " output size: " << outputs.size();
-    LOG(paddle_mobile::kLOG_DEBUG1) << " attr size: " << attrs.size();
-    LOG(paddle_mobile::kLOG_DEBUG1)
-        << " OpInfoMap size: " << OpInfoMap<Dtype>::Instance()->map().size();
-    LOG(paddle_mobile::kLOG_DEBUG1) << " has type: " << type << " "
-                                    << OpInfoMap<Dtype>::Instance()->Has(type);
     auto& info = OpInfoMap<Dtype>::Instance()->Get(type);
     auto op = info.Creator()(type, inputs, outputs, attrs, scope);
     return std::shared_ptr<OperatorBase<Dtype>>(op);

diff --git a/src/io.cpp b/src/io.cpp
index f03e9a56e2d..d92eec510e3 100644
--- a/src/io.cpp
+++ b/src/io.cpp
@@ -45,7 +45,7 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
   printf("%s \n", file_name);
   FILE *fp;
   fp = fopen(file_name, "rb");
-  PADDLE_MOBILE_ENFORCE(fp != NULL, "open failed !");
+  PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name);

   fseek(fp, 0, SEEK_END);
   size_t size = ftell(fp);
@@ -210,7 +210,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
           tensor->Resize(framework::make_ddim(dim));
         } else {
           auto dim = var_desc->Tensor_desc().Dims();
-          PADDLE_MOBILE_ENFORCE(dim.size() > 1, "dim size is 0");
+          PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
           dim[0] = 1;
           auto tensor = var->GetMutable<framework::LoDTensor>();
           tensor->Resize(framework::make_ddim(dim));
@@ -380,7 +380,8 @@ void Executor<Dtype, P>::InitMemory() {
                    program_.model_path + "/" + var_desc->Name());
       } else {
         if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
-          auto tensor = var->template GetMutable<framework::Tensor>();
+          auto tensor = var->template GetMutable<framework::LoDTensor>();
+          tensor->template mutable_data<Ptype>();
         }
       }
     }
   }
 }

@@ -388,44 +389,17 @@
-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> Executor<Dtype, P>::predict(
-    framework::Tensor &t) {
-  // feed
-  auto scope = program_.scope;
-  framework::Variable *g_feed_value = scope->Var("pixel");
-  auto tensor = g_feed_value->GetMutable<framework::Tensor>();
-  tensor->ShareDataWith(t);
-
-  framework::Variable *con_output = scope->Var("conv2d_0.tmp_0");
-  framework::Tensor *output_tensor =
-      con_output->GetMutable<framework::Tensor>();
-  output_tensor->mutable_data<float>({1, 16, 32, 32});
-  // std::cout << typeid(output_tensor).name() << std::endl;
-  // std::cout << "output_tensor dims: " << output_tensor->dims() <<
-  // std::endl;
-
-  std::shared_ptr<framework::Tensor> out_tensor =
-      std::make_shared<framework::Tensor>();
-  out_tensor.reset(output_tensor);
-
-  predict(t, 0);
-  return out_tensor;
-}
-
 template <typename Dtype, Precision P>
 void Executor<Dtype, P>::predict(const framework::Tensor &t, int block_id) {
   framework::Variable *g_feed_value = program_.scope->Var("feed");
   auto feed_tensor = g_feed_value->GetMutable<framework::LoDTensor>();
   feed_tensor->Resize(t.dims());
-  feed_tensor->ShareDataWith(t);
-  std::shared_ptr<framework::BlockDesc> to_predict_block =
+  feed_tensor->ShareDataWith(t);
+  std::shared_ptr<framework::BlockDesc> to_predict_block =
       to_predict_program_->Block(block_id);
   for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
     auto op = ops_of_block_[*to_predict_block.get()][j];
-    op->Run();
+    op->Run();
   }
 }

diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp
index c8ac141f9ca..51d99605774 100644
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -44,13 +44,13 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   std::vector<int> paddings = param.Paddings();
   std::vector<int> dilations = param.Dilations();

-  DLOG << " compute end get Attrs " << strides[0];
+// DLOG << " compute end get Attrs " << strides[0];

   const int batch_size = static_cast<int>(input->dims()[0]);

   std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
   size_t data_dim = filter_shape_vec.size() - 2;
   std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
   col_shape_vec[0] = input->dims()[1] / groups;
@@ -71,8 +71,6 @@
     col_matrix.ShareDataWith(col);
     col_matrix.Resize(col_matrix_shape);
   }
-  DLOG << " col_shape = " << col_shape;
-  DLOG << " col_matrix_shape = " << col_matrix_shape;

   framework::DDim input_shape = framework::slice_ddim(
       input->dims(), 1, static_cast<int>(input->dims().size()));
@@ -80,8 +78,6 @@
   framework::DDim filter_matrix_shape = {filter.dims()[0],
                                          filter.numel() / filter.dims()[0]};
   filter.Resize(filter_matrix_shape);
-  DLOG << " filter.deims() = " << filter.dims();
-
   framework::DDim output_matrix_shape = {
       output->dims()[1],
       output->numel() / (output->dims()[0] * output->dims()[1])};
@@ -118,9 +114,6 @@
       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      DLOG << " out_slice " << out_slice.dims();
-      DLOG << " filter_slice " << filter_slice.dims();
-      DLOG << " col_matrix " << col_matrix.dims();
       math::matmul<float>(filter_slice, false, col_matrix, false,
                           static_cast<float>(1), &out_slice,
                           static_cast<float>(0));

diff --git a/test/executor_for_test.h b/test/executor_for_test.h
index 35bc71f1101..045658cbfc8 100644
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -77,13 +77,13 @@ class Executor4Test : public Executor<DeviceType> {
                                   const DDim &dDim) {
     auto scope = this->program_.scope;
     Variable *g_feed_value = scope->Var(input);
-    auto tensor = g_feed_value->GetMutable<Tensor>();
+    auto tensor = g_feed_value->GetMutable<LoDTensor>();
     tensor->ShareDataWith(t);

     Variable *con_output = scope->Var(output);
-    auto *output_tensor = con_output->GetMutable<Tensor>();
+    auto *output_tensor = con_output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>(dDim);
-    std::shared_ptr<Tensor> out_tensor = std::make_shared<Tensor>();
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
     out_tensor.reset(output_tensor);

     std::shared_ptr<BlockDesc> to_predict_block =

diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp
index e053ca1e904..adf03761327 100644
--- a/test/operators/test_sigmoid_op.cpp
+++ b/test/operators/test_sigmoid_op.cpp
@@ -19,16 +19,12 @@ limitations under the License. */
 int main() {
   paddle_mobile::framework::Tensor input;
   paddle_mobile::framework::Tensor output;
-  DLOG << 1;
   SetupTensor<float>(&input, {1, 4, 60, 60}, static_cast<float>(0),
                      static_cast<float>(1));
-  DLOG << 2;
   auto out_ddim = paddle_mobile::framework::make_ddim({1, 4, 60, 60});
   output.Resize(out_ddim);
-  DLOG << 3;
   paddle_mobile::operators::sigmoid(&input, &output);
-  DLOG << 4;
   auto *output_ptr = output.data<float>();
   for (int j = 0; j < output.numel(); ++j) {
     DLOG << " value of output: " << output_ptr[j];

diff --git a/test/operators/test_softmax_op.cpp b/test/operators/test_softmax_op.cpp
index 5dd42e83e3c..ed5a1a49f55 100644
--- a/test/operators/test_softmax_op.cpp
+++ b/test/operators/test_softmax_op.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string("models/mobilenet"));
+  auto program = loader.Load(std::string("../models/mobilenet"));
   if (program.originProgram == nullptr) {
     DLOG << "program read file";
   }
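[Editor's note] Patch 01 switches ReadBuffer over to the variadic, printf-style form of
PADDLE_MOBILE_ENFORCE so the failing file name shows up in the message. A minimal sketch
of that calling pattern, assuming the macro keeps its (condition, format, args...) shape;
the FileSize helper below is hypothetical and only illustrates the macro usage:

    #include <cstdio>
    #include "common/enforce.h"

    static size_t FileSize(const char *file_name) {
      FILE *fp = fopen(file_name, "rb");
      // condition first, then a format string plus its arguments
      PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name);
      fseek(fp, 0, SEEK_END);
      size_t size = ftell(fp);
      fclose(fp);
      return size;
    }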
From d5230003971cdabae4c36521b389e1c93fe1e222 Mon Sep 17 00:00:00 2001
From: wangliu
Date: Tue, 29 May 2018 13:55:25 +0800
Subject: [PATCH 02/26] modify code style

---
 src/io.cpp                               | 2 +-
 src/operators/kernel/arm/conv_kernel.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/io.cpp b/src/io.cpp
index d92eec510e3..e0df6e732ec 100644
--- a/src/io.cpp
+++ b/src/io.cpp
@@ -399,7 +399,7 @@ void Executor<Dtype, P>::predict(const framework::Tensor &t, int block_id) {
       to_predict_program_->Block(block_id);
   for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
     auto op = ops_of_block_[*to_predict_block.get()][j];
-    op->Run();
+    op->Run();
   }
 }

diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp
index 51d99605774..35ee1950329 100644
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -44,7 +44,7 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   std::vector<int> paddings = param.Paddings();
   std::vector<int> dilations = param.Dilations();

-// DLOG << " compute end get Attrs " << strides[0];
+  // DLOG << " compute end get Attrs " << strides[0];

   const int batch_size = static_cast<int>(input->dims()[0]);
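[Editor's note] Patch 03 below generalizes Executor4Test::predict from a single
input/output pair to parallel vectors of tensors, variable names, and output dims.
A minimal sketch of the calling pattern it enables, assuming the signatures shown in
the diff; the tensor shapes and variable names are illustrative (taken from the mul test):

    // Build two inputs, name the graph variables to feed/fetch, then run.
    vector<Tensor> input_tensors;
    Tensor a, b;
    CreateInput<float>(&a, {3, 2, 1, 1}, 0, 1);  // x: shape {3,2,1,1}
    CreateInput<float>(&b, {2, 3}, 0, 1);        // y: shape {2,3}
    input_tensors.push_back(a);
    input_tensors.push_back(b);

    vector<string> input_names({"pool2d_0.tmp_0", "fc_0.w_0"});
    vector<string> output_names({"fc_0.tmp_0"});
    vector<DDim> out_ddims({paddle_mobile::framework::make_ddim({3, 3})});

    // One shared_ptr<Tensor> per requested output, in output_names order.
    auto outs = executor.predict<LoDTensor>(input_tensors, input_names,
                                            output_names, out_ddims);
    auto *out_data = outs[0]->data<float>();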
From 030daef95b2c2cae0efc44c065f27366eae158ab Mon Sep 17 00:00:00 2001
From: eclipsess
Date: Tue, 29 May 2018 15:56:32 +0800
Subject: [PATCH 03/26] update excutor for test

---
 src/framework/tensor.h                     |   3 +-
 src/io.cpp                                 |  11 +-
 src/operators/kernel/arm/conv_kernel.cpp   |   3 +-
 src/operators/op_param.h                   |  12 +-
 test/CMakeLists.txt                        |   4 +-
 test/executor_for_test.h                   |  40 +++--
 test/operators/test_batchnorm_op.cpp       |  12 +-
 test/operators/test_box_coder_op.cpp       |   8 +-
 test/operators/test_concat_op.cpp          | 188 ++++++--------------
 test/operators/test_elementwise_add_op.cpp | 151 ++++------------
 test/operators/test_fushion_fc_op.cpp      |  16 +-
 test/operators/test_lrn_op.cpp             | 131 ++++----------
 test/operators/test_mul_op.cpp             | 189 ++++++---------------
 test/operators/test_multiclass_nms_op.cpp  |   6 +-
 test/operators/test_prior_box_op.cpp       |   8 +-
 test/operators/test_relu_op.cpp            |  37 ++--
 test/test_helper.h                         |   6 +
 17 files changed, 278 insertions(+), 547 deletions(-)

diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 7fdb52c435c..b6a7c724ad1 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -219,7 +219,8 @@ class Tensor {
   inline void check_memory_size() const {
     PADDLE_MOBILE_ENFORCE(
-        holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+        holder_ != nullptr,
+        "Tensor holds no memory. Call Tensor::mutable_data first.");
     PADDLE_MOBILE_ENFORCE(
         numel() * SizeOfType(type()) <= memory_size(),
         "Tensor's dims_ is out of bound. CallTensor::mutable_data "

diff --git a/src/io.cpp b/src/io.cpp
index f03e9a56e2d..271a3190ae0 100644
--- a/src/io.cpp
+++ b/src/io.cpp
@@ -210,7 +210,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
           tensor->Resize(framework::make_ddim(dim));
         } else {
           auto dim = var_desc->Tensor_desc().Dims();
-          PADDLE_MOBILE_ENFORCE(dim.size() > 1, "dim size is 0");
+          PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
           dim[0] = 1;
           auto tensor = var->GetMutable<framework::LoDTensor>();
           tensor->Resize(framework::make_ddim(dim));
@@ -221,7 +221,7 @@
     }
   }

-  // originProgramDesc->Description("program: ");
+  originProgramDesc->Description("program: ");

   paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
   return program;
@@ -380,7 +380,7 @@ void Executor<Dtype, P>::InitMemory() {
                    program_.model_path + "/" + var_desc->Name());
       } else {
         if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
-          auto tensor = var->template GetMutable<framework::Tensor>();
+          auto tensor = var->template GetMutable<framework::LoDTensor>();
           tensor->template mutable_data<Ptype>();
         }
       }
@@ -416,7 +416,8 @@
 template <typename Dtype, Precision P>
 void Executor<Dtype, P>::predict(const framework::Tensor &t, int block_id) {
   framework::Variable *g_feed_value = program_.scope->Var("feed");
-  auto feed_tensor = g_feed_value->GetMutable<framework::LoDTensor>();
+  framework::Tensor *feed_tensor =
+      g_feed_value->GetMutable<framework::LoDTensor>();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
@@ -434,7 +435,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::predict(
     const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
   DLOG << "start predict: ";

-  framework::Tensor tensor;
+  framework::LoDTensor tensor;
   auto ddim = framework::make_ddim(dims);

   auto input_ptr = tensor.mutable_data<Ptype>(ddim);

diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp
index c8ac141f9ca..7a566b6ac8f 100644
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -38,7 +38,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   Tensor filter = *param.Filter();
   Tensor *output = param.Output();
   output->mutable_data<float>();
-
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
   std::vector<int> dilations = param.Dilations();
@@ -80,7 +79,7 @@
   framework::DDim filter_matrix_shape = {filter.dims()[0],
                                          filter.numel() / filter.dims()[0]};
   filter.Resize(filter_matrix_shape);
-  DLOG << " filter.deims() = " << filter.dims();
+  DLOG << " filter.dims() = " << filter.dims();

   framework::DDim output_matrix_shape = {
       output->dims()[1],

diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 5ac6fc67af5..02bda7147aa 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -207,7 +207,7 @@ class ConvParam : OpParam {
   const Tensor *Input() const { return input_; }

-  const LoDTensor *Filter() const { return filter_; }
+  const Tensor *Filter() const { return filter_; }

   Tensor *Output() const { return output_; }
@@ -222,7 +222,7 @@
  private:
   Tensor *input_;
   Tensor *output_;
-  LoDTensor *filter_;
+  Tensor *filter_;
   vector<int> strides_;
   vector<int> paddings_;
   vector<int> dilations_;
@@ -717,10 +717,10 @@ class FushionFcParam : public OpParam {
  public:
  FushionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
                 const AttributeMap &attrs, const Scope &scope) {
-    input_x_ = InputXFrom<framework::Tensor>(inputs, scope);
-    input_y_ = InputYFrom<framework::Tensor>(inputs, scope);
-    input_z_ = InputZFrom<framework::Tensor>(inputs, scope);
-    out_ = OutFrom<framework::Tensor>(outputs, scope);
+    input_x_ = InputXFrom<framework::LoDTensor>(inputs, scope);
+    input_y_ = InputYFrom<framework::LoDTensor>(inputs, scope);
+    input_z_ = InputZFrom<framework::LoDTensor>(inputs, scope);
+    out_ = OutFrom<framework::LoDTensor>(outputs, scope);
     x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
     y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
     axis_ = GetAttr<int>("axis", attrs);

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 20d6cfe7a78..f464c3bd94f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -11,11 +11,11 @@ ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h)
 target_link_libraries(test-mul-op paddle-mobile)

 # gen test
-ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h)
+ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h)
 target_link_libraries(test-elementwiseadd-op paddle-mobile)

 # gen test
-ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
+ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
 target_link_libraries(test-concat-op paddle-mobile)

 # gen test

diff --git a/test/executor_for_test.h b/test/executor_for_test.h
index 35bc71f1101..a54a8bb191a 100644
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "common/log.h"
 #include "framework/op_registry.h"
 #include "operators/conv_op.h"
+#include "operators/elementwise_add_op.h"
 #include "operators/pool_op.h"
 #include "operators/relu_op.h"
 #include "operators/reshape_op.h"
@@ -37,6 +38,7 @@
 using paddle_mobile::framework::Program;
 using paddle_mobile::framework::Tensor;
 using paddle_mobile::framework::Variable;
 using std::string;
+using std::vector;

 template <typename DeviceType, typename OpType>
 class Executor4Test : public Executor<DeviceType> {
  public:
@@ -73,18 +75,34 @@
     }
   }

-  std::shared_ptr<Tensor> predict(const Tensor &t, string input, string output,
-                                  const DDim &dDim) {
+  template <typename T = LoDTensor>
+  vector<std::shared_ptr<Tensor>> predict(const vector<Tensor> &ts,
+                                          const vector<string> &input_names,
+                                          const vector<string> &output_names,
+                                          const vector<DDim> &ddims) {
     auto scope = this->program_.scope;
-    Variable *g_feed_value = scope->Var(input);
-    auto tensor = g_feed_value->GetMutable<LoDTensor>();
-    tensor->ShareDataWith(t);
+    size_t input_size = input_names.size();
+    size_t out_size = output_names.size();

-    Variable *con_output = scope->Var(output);
-    auto *output_tensor = con_output->GetMutable<LoDTensor>();
-    output_tensor->mutable_data<float>(dDim);
-    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
-    out_tensor.reset(output_tensor);
+    vector<Variable *> input_vars(input_size);
+    vector<LoDTensor *> input_tensors(input_size);
+    for (int i = 0; i < input_size; i++) {
+      input_vars[i] = scope->Var(input_names[i]);
+      input_tensors[i] = input_vars[i]->GetMutable<T>();
+      input_tensors[i]->ShareDataWith(ts[i]);
+    }
+
+    vector<Variable *> output_vars(out_size);
+    vector<LoDTensor *> output_tensors(out_size);
+    vector<std::shared_ptr<Tensor>> output_tensor_sptrs(out_size);
+
+    for (int i = 0; i < out_size; i++) {
+      output_vars[i] = scope->Var(output_names[i]);
+      output_tensors[i] = output_vars[i]->GetMutable<T>();
+      output_tensors[i]->mutable_data<float>(ddims[i]);
+      output_tensor_sptrs[i] = std::make_shared<Tensor>();
+      output_tensor_sptrs[i].reset(output_tensors[i]);
+    }

     std::shared_ptr<BlockDesc> to_predict_block =
         this->to_predict_program_->Block(0);
@@ -94,6 +112,6 @@
       op->Run();
     }

-    return out_tensor;
+    return output_tensor_sptrs;
   }
 };

diff --git a/test/operators/test_batchnorm_op.cpp b/test/operators/test_batchnorm_op.cpp
index 385617317df..ba2e06b80b4 100644
--- a/test/operators/test_batchnorm_op.cpp
+++ b/test/operators/test_batchnorm_op.cpp
@@ -68,27 +68,27 @@ class TestBatchNormOp {
     // feed
     auto scope = program_.scope;
     Variable *x1_feed_value = scope->Var("conv2d_0.tmp_0");
-    auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
+    auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
     tensor_x1->ShareDataWith(t1);

     Variable *mean_feed_value = scope->Var("batch_norm_0.w_1");
-    auto tensor_mean = mean_feed_value->GetMutable<Tensor>();
+    auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>();
     tensor_mean->ShareDataWith(t2);

     Variable *scale_feed_value = scope->Var("batch_norm_0.w_0");
-    auto tensor_scale = scale_feed_value->GetMutable<Tensor>();
+    auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>();
     tensor_scale->ShareDataWith(t3);

     Variable *variance_feed_value = scope->Var("batch_norm_0.w_2");
-    auto tensor_variance = variance_feed_value->GetMutable<Tensor>();
+    auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>();
     tensor_variance->ShareDataWith(t4);

     Variable *bias_feed_value = scope->Var("batch_norm_0.b_0");
-    auto tensor_bias = bias_feed_value->GetMutable<Tensor>();
+    auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>();
     tensor_bias->ShareDataWith(t5);

     Variable *output = scope->Var("batch_norm_0.tmp_2");
-    auto *output_tensor = output->GetMutable<Tensor>();
+    auto *output_tensor = output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>({4, 10, 2, 2});
     // DLOG << typeid(output_tensor).name();
     // DLOG << "output_tensor dims: " << output_tensor->dims();

diff --git a/test/operators/test_box_coder_op.cpp b/test/operators/test_box_coder_op.cpp
index dea59e8bf2c..b7695c91dfb 100644
--- a/test/operators/test_box_coder_op.cpp
+++ b/test/operators/test_box_coder_op.cpp
@@ -62,19 +62,19 @@ class TestBoxCoderOp {
     // feed
     auto scope = program_.scope;
     Variable *prior_box = scope->Var("concat_0.tmp_0");
-    auto tensor_x1 = prior_box->GetMutable<Tensor>();
+    auto tensor_x1 = prior_box->GetMutable<LoDTensor>();
     tensor_x1->ShareDataWith(t1);

     Variable *prior_box_var = scope->Var("concat_1.tmp_0");
-    auto tensor_x2 = prior_box_var->GetMutable<Tensor>();
+    auto tensor_x2 = prior_box_var->GetMutable<LoDTensor>();
     tensor_x2->ShareDataWith(t2);

     Variable *target_box = scope->Var("concat_2.tmp_0");
-    auto tensor_x3 = target_box->GetMutable<Tensor>();
+    auto tensor_x3 = target_box->GetMutable<LoDTensor>();
     tensor_x3->ShareDataWith(t3);

     Variable *boxes_output = scope->Var("box_coder_0.tmp_0");
-    auto *boxes_output_tensor = boxes_output->GetMutable<Tensor>();
+    auto *boxes_output_tensor = boxes_output->GetMutable<LoDTensor>();
     boxes_output_tensor->mutable_data<float>({1, 1917, 4});

     // DLOG << typeid(output_tensor).name();

diff --git a/test/operators/test_concat_op.cpp b/test/operators/test_concat_op.cpp
index 205274ea7ab..a9bb072f1e9 100644
--- a/test/operators/test_concat_op.cpp
+++ b/test/operators/test_concat_op.cpp
@@ -12,148 +12,64 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once
+#include "../executor_for_test.h"
 #include "../test_include.h"
 #include "operators/concat_op.h"

-namespace paddle_mobile {
-namespace framework {
-
-template <typename Dtype>
-class TestConcatOp {
- public:
-  explicit TestConcatOp(const Program<Dtype> p) : program_(p) {
-    if (use_optimize_) {
-      to_predict_program_ = program_.optimizeProgram;
-    } else {
-      to_predict_program_ = program_.originProgram;
-    }
-
-    const std::vector<std::shared_ptr<BlockDesc>> blocks =
-        to_predict_program_->Blocks();
-    // DLOG << " **block size " << blocks.size();
-    for (int i = 0; i < blocks.size(); ++i) {
-      std::shared_ptr<BlockDesc> block_desc = blocks[i];
-      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      // DLOG << " ops " << ops.size();
-      for (int j = 0; j < ops.size(); ++j) {
-        std::shared_ptr<OpDesc> op = ops[j];
-        if (op->Type() == "concat" && op->Input("X")[0] == "conv2d_3.tmp_1") {
-          DLOG << " mul attr size: " << op->GetAttrMap().size();
-          DLOG << " inputs size: " << op->GetInputs().size();
-          DLOG << " outputs size: " << op->GetOutputs().size();
-          DLOG << " Input X is : " << op->Input("X")[0];
-          DLOG << " Output Out is : " << op->Output("Out")[0];
-          DLOG << " axis : " << op->GetAttrMap().at("axis").Get<int>();
-
-          std::shared_ptr<operators::ConcatOp<Dtype, float>> concat =
-              std::make_shared<operators::ConcatOp<Dtype, float>>(
-                  op->Type(), op->GetInputs(), op->GetOutputs(),
-                  op->GetAttrMap(), program_.scope);
-          ops_of_block_[*block_desc.get()].push_back(concat);
-        }
-      }
-    }
-  }
-
-  std::shared_ptr<Tensor> predict_concat(const Tensor &t1, const Tensor &t2,
-                                         const Tensor &t3, const Tensor &t4) {
-    // feed
-    auto scope = program_.scope;
-    Variable *x1_feed_value = scope->Var("conv2d_3.tmp_1");
-    auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
-    tensor_x1->ShareDataWith(t1);
-
-    Variable *x2_feed_value = scope->Var("conv2d_5.tmp_1");
-    auto tensor_x2 = x2_feed_value->GetMutable<Tensor>();
-    tensor_x2->ShareDataWith(t2);
-
-    Variable *x3_feed_value = scope->Var("conv2d_7.tmp_1");
-    auto tensor_x3 = x3_feed_value->GetMutable<Tensor>();
-    tensor_x3->ShareDataWith(t3);
-
-    Variable *x4_feed_value = scope->Var("conv2d_8.tmp_1");
-    auto tensor_x4 = x4_feed_value->GetMutable<Tensor>();
-    tensor_x4->ShareDataWith(t4);
-
-    Variable *con_output = scope->Var("concat_0.tmp_0");
-    auto *output_tensor = con_output->GetMutable<Tensor>();
-    output_tensor->mutable_data<float>({4, 100, 2, 2});
-    // DLOG << typeid(output_tensor).name();
-    // DLOG << "output_tensor dims: " << output_tensor->dims();
-
-    std::shared_ptr<Tensor> out_tensor = std::make_shared<Tensor>();
-    out_tensor.reset(output_tensor);
-
-    predict_concat(t1, t2, t3, t4, 0);
-    return out_tensor;
-  }
-
- private:
-  const framework::Program<Dtype> program_;
-  std::shared_ptr<ProgramDesc> to_predict_program_;
-  std::map<framework::BlockDesc,
-           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
-      ops_of_block_;
-  bool use_optimize_ = false;
-
-  void predict_concat(const Tensor &t1, const Tensor &t2, const Tensor &t3,
-                      const Tensor &t4, int block_id) {
-    std::shared_ptr<BlockDesc> to_predict_block =
-        to_predict_program_->Block(block_id);
-    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
-      auto op = ops_of_block_[*to_predict_block.get()][j];
-      DLOG << "op -> run()";
-      op->Run();
-    }
-  }
-};
-
-template class TestConcatOp<CPU>;
-}  // namespace framework
-}  // namespace paddle_mobile
-
 int main() {
-  DLOG << "----------**********----------";
-  DLOG << "begin to run ConcatOp Test";
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string("../../test/models/googlenet"));
-
-  /// input x (4,10,2,2)
-  paddle_mobile::framework::Tensor inputx1;
-  SetupTensor<float>(&inputx1, {4, 10, 2, 2}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto *inputx1_ptr = inputx1.data<float>();
-  /// input x (4,20,2,2)
-  paddle_mobile::framework::Tensor inputx2;
-  SetupTensor<float>(&inputx2, {4, 20, 2, 2}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto *inputx2_ptr = inputx2.data<float>();
-  /// input x (4,30,2,2)
-  paddle_mobile::framework::Tensor inputx3;
-  SetupTensor<float>(&inputx3, {4, 30, 2, 2}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto *inputx3_ptr = inputx3.data<float>();
-  /// input x (4,40,2,2)
-  paddle_mobile::framework::Tensor inputx4;
-  SetupTensor<float>(&inputx4, {4, 40, 2, 2}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto *inputx4_ptr = inputx4.data<float>();
-
-  paddle_mobile::framework::TestConcatOp<paddle_mobile::CPU> testConcatOp(
-      program);
-
-  auto output_concat =
-      testConcatOp.predict_concat(inputx1, inputx2, inputx3, inputx4);
-  auto *output_concat_ptr = output_concat->data<float>();
-
+  auto program = loader.Load(g_googlenet);
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::CPU,
+                paddle_mobile::operators::ConcatOp<paddle_mobile::CPU, float>>
+      executor(program, "concat");
+
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;
+
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
+  input_tensors.push_back(input1);
+  Tensor input2;
+  auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
+  input_tensors.push_back(input2);
+  Tensor input3;
+  auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
+  input_tensors.push_back(input3);
+  Tensor input4;
+  auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
+  input_tensors.push_back(input4);
+  // 2. input_names
+  vector<string> input_names({
+      "conv2d_3.tmp_1",
+      "conv2d_5.tmp_1",
+      "conv2d_7.tmp_1",
+      "conv2d_8.tmp_1",
+  });
+
+  // 3. output_names
+  vector<string> output_names({"concat_0.tmp_0"});
+
+  // 4. out_dims;
+  vector<DDim> out_ddims;
+  auto out_ddim = paddle_mobile::framework::make_ddim({3, 100, 2, 2});
+  out_ddims.push_back(out_ddim);
+
+  auto output = executor.predict<LoDTensor>(input_tensors, input_names,
+                                            output_names, out_ddims);
+
+  auto output0_data = output[0]->data<float>();
+
+  // 5. test one example.
   int input_n = 1;
   int input_c = 2;
   int input_h = 0;
   int input_w = 1;
-  int stride0 = inputx3.numel() / inputx3.dims()[0];
-  int stride1 = inputx3.numel() / inputx3.dims()[0] / inputx3.dims()[1];
-  int stride2 = inputx3.dims()[3];
+  int stride0 = input3.numel() / input3.dims()[0];
+  int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
+  int stride2 = input3.dims()[3];
   /// inputx1 (4,10,2,2),
   /// inputx2 (4,20,2,2),
   /// inputx3 (4,30,2,2),
@@ -163,10 +79,10 @@
   int input_index =
       input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
   int output_index = input_n * 100 * 2 * 2 +
-                     (input_c + inputx1.dims()[1] + inputx2.dims()[1]) * 2 * 2 +
+                     (input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
                      input_h * 2 + input_w;
-  DLOG << " inputx3[1,2,0,1] = " << inputx3_ptr[input_index];
-  DLOG << " output[1,12,0,1] = " << output_concat_ptr[output_index];
+  DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index];
+  DLOG << " output [1,32,0,1] = " << output0_data[output_index];
   return 0;
 }
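[Editor's note] The index arithmetic at the end of the concat test can be read as plain
channel-offset math; a standalone sketch, assuming NCHW layout and concatenation along
axis 1 (all names here are illustrative, not part of the patch):

    // For inputs with channel counts c0, c1, c2, ... concatenated on axis 1,
    // element (n, c, h, w) of input k lands at channel c + c0 + ... + c(k-1)
    // of the output. With shapes {4,10,2,2}, {4,20,2,2}, {4,30,2,2}, {4,40,2,2}
    // the output is {4,100,2,2}, so input3's (1,2,0,1) maps to output (1,32,0,1).
    int n = 1, c = 2, h = 0, w = 1;
    int channel_offset = 10 + 20;              // channels before input3
    int out_c = c + channel_offset;            // 32
    int out_index = ((n * 100 + out_c) * 2 + h) * 2 + w;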
diff --git a/test/operators/test_elementwise_add_op.cpp b/test/operators/test_elementwise_add_op.cpp
index eeb642a3f48..1b4bf457a2c 100644
--- a/test/operators/test_elementwise_add_op.cpp
+++ b/test/operators/test_elementwise_add_op.cpp
@@ -12,133 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once
+#include "../executor_for_test.h"
 #include "../test_include.h"
-#include "operators/elementwise_add_op.h"

-namespace paddle_mobile {
-namespace framework {
-
-template <typename Dtype>
-class TestElementwiseAddOp {
- public:
-  explicit TestElementwiseAddOp(const Program<Dtype> p) : program_(p) {
-    if (use_optimize_) {
-      to_predict_program_ = program_.optimizeProgram;
-    } else {
-      to_predict_program_ = program_.originProgram;
-    }
-
-    const std::vector<std::shared_ptr<BlockDesc>> blocks =
-        to_predict_program_->Blocks();
-    // DLOG << " **block size " << blocks.size();
-    for (int i = 0; i < blocks.size(); ++i) {
-      std::shared_ptr<BlockDesc> block_desc = blocks[i];
-      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      // DLOG << " ops " << ops.size();
-      for (int j = 0; j < ops.size(); ++j) {
-        std::shared_ptr<OpDesc> op = ops[j];
-        if (op->Type() == "elementwise_add" &&
-            op->Input("X")[0] == "batch_norm_2.tmp_2") {
-          DLOG << " elementwise_add attr size: " << op->GetAttrMap().size();
-          DLOG << " inputs size: " << op->GetInputs().size();
-          DLOG << " outputs size: " << op->GetOutputs().size();
-          DLOG << " Input X is : " << op->Input("X")[0];
-          DLOG << " Input Y is : " << op->Input("Y")[0];
-          DLOG << " Output Out is : " << op->Output("Out")[0];
-          Attribute axis_attr = op->GetAttrMap().at("axis");
-          int axis = axis_attr.Get<int>();
-          DLOG << " Attr axis is : " << axis;
-
-          std::shared_ptr<operators::ElementwiseAddOp<Dtype, float>> add =
-              std::make_shared<operators::ElementwiseAddOp<Dtype, float>>(
-                  op->Type(), op->GetInputs(), op->GetOutputs(),
-                  op->GetAttrMap(), program_.scope);
-          ops_of_block_[*block_desc.get()].push_back(add);
-        }
-      }
-    }
-  }
-
-  std::shared_ptr<Tensor> predict_add(const Tensor &t1, const Tensor &t2) {
-    // feed
-    auto scope = program_.scope;
-    Variable *x_feed_value = scope->Var("batch_norm_2.tmp_2");
-    auto tensor_x = x_feed_value->GetMutable<Tensor>();
-    tensor_x->ShareDataWith(t1);
-
-    Variable *y_feed_value = scope->Var("batch_norm_0.tmp_3");
-    auto tensor_y = y_feed_value->GetMutable<Tensor>();
-    tensor_y->ShareDataWith(t2);
-
-    Variable *con_output = scope->Var("elementwise_add_0.tmp_0");
-    auto *output_tensor = con_output->GetMutable<Tensor>();
-    output_tensor->mutable_data<float>({1, 3, 224, 224});
-    // DLOG << typeid(output_tensor).name();
-    // DLOG << "output_tensor dims: " << output_tensor->dims();
+int main() {
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(g_resnet);
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");

-    std::shared_ptr<Tensor> out_tensor = std::make_shared<Tensor>();
-    out_tensor.reset(output_tensor);
+  Executor4Test<paddle_mobile::CPU,
+                paddle_mobile::operators::ElementwiseAddOp<paddle_mobile::CPU,
+                                                           float>>
+      executor(program, "elementwise_add");

-    predict_add(t1, t2, 0);
-    return out_tensor;
-  }
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;

- private:
-  const framework::Program<Dtype> program_;
-  std::shared_ptr<ProgramDesc> to_predict_program_;
-  std::map<framework::BlockDesc,
-           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
-      ops_of_block_;
-  bool use_optimize_ = false;
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {1, 3, 224, 224}, 0, 1);
+  input_tensors.push_back(input1);

-  void predict_add(const Tensor &t1, const Tensor &t2, int block_id) {
-    std::shared_ptr<BlockDesc> to_predict_block =
-        to_predict_program_->Block(block_id);
-    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
-      auto op = ops_of_block_[*to_predict_block.get()][j];
-      DLOG << "op -> run()";
-      op->Run();
-    }
-  }
-};
+  Tensor input2;
+  auto input2_data = CreateInput<float>(&input2, {224}, 0, 1);
+  input_tensors.push_back(input2);

-template class TestElementwiseAddOp<CPU>;
-}  // namespace framework
-}  // namespace paddle_mobile
-int main() {
-  DLOG << "----------**********----------";
-  DLOG << "begin to run ElementAddOp Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program =
-      loader.Load(std::string("../models/"
-                              "image_classification_resnet.inference.model"));
+  // 2. input_names
+  vector<string> input_names({
+      "batch_norm_2.tmp_2",
+      "batch_norm_0.tmp_3",
+  });

-  /// input x (1,3,224,224)
-  paddle_mobile::framework::Tensor inputx;
-  SetupTensor<float>(&inputx, {1, 3, 224, 224}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto *inputx_ptr = inputx.data<float>();
-  /// input y (224,)
-  paddle_mobile::framework::Tensor inputy;
-  SetupTensor<float>(&inputy, {224}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto *inputy_ptr = inputy.data<float>();
+  // 3. output_names
+  vector<string> output_names({"elementwise_add_0.tmp_0"});

-  paddle_mobile::framework::TestElementwiseAddOp<paddle_mobile::CPU>
-      testElementwiseAddOp(program);
+  // 4. out_dims;
+  vector<DDim> out_ddims;
+  auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 224, 224});
+  out_ddims.push_back(out_ddim);

-  auto output_add = testElementwiseAddOp.predict_add(inputx, inputy);
-  auto *output_add_ptr = output_add->data<float>();
-  //  for (int j = 0; j < output_add->numel(); ++j) {
-  //    DLOG << "value of output: " << output_add_ptr[j];
-  //  }
+  auto output = executor.predict<LoDTensor>(input_tensors, input_names,
+                                            output_names, out_ddims);

+  auto output0_data = output[0]->data<float>();
   /// output (1,3,224,224)
-  DLOG << "output memory size : " << output_add->memory_size();
-  DLOG << "output numel : " << output_add->numel();
+  DLOG << "output memory size : " << output[0]->memory_size();
+  DLOG << "output numel : " << output[0]->numel();

-  DLOG << inputx_ptr[226] << " + " << inputy_ptr[2] << " = "
-       << output_add_ptr[226];
-  return 0;
+  DLOG << input1_data[226] << " + " << input2_data[2] << " = "
+       << output0_data[226];
 }

diff --git a/test/operators/test_fushion_fc_op.cpp b/test/operators/test_fushion_fc_op.cpp
index b52989b2e8b..6063772d85a 100644
--- a/test/operators/test_fushion_fc_op.cpp
+++ b/test/operators/test_fushion_fc_op.cpp
@@ -64,24 +64,24 @@ class TestFcOp {
     // feed
     auto scope = program_.scope;
     Variable *x_feed_value = scope->Var("pool2d_13.tmp_0");
-    auto tensor_x = x_feed_value->GetMutable<Tensor>();
+    auto tensor_x = x_feed_value->GetMutable<LoDTensor>();
     tensor_x->ShareDataWith(t1);

     Variable *y_feed_value = scope->Var("loss3_classifier-loc_weights");
-    auto tensor_y = y_feed_value->GetMutable<Tensor>();
+    auto tensor_y = y_feed_value->GetMutable<LoDTensor>();
     tensor_y->ShareDataWith(t2);

     Variable *z_feed_value = scope->Var("loss3_classifier-loc_biases");
-    auto tensor_z = z_feed_value->GetMutable<Tensor>();
+    auto tensor_z = z_feed_value->GetMutable<LoDTensor>();
     tensor_z->ShareDataWith(t3);

     Variable *con_output = scope->Var("loss3_classifier-loc.tmp_1");
-    auto *output_tensor = con_output->GetMutable<Tensor>();
+    auto *output_tensor = con_output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>({3, 10});
     // DLOG << typeid(output_tensor).name();
     // DLOG << "output_tensor dims: " << output_tensor->dims();

-    std::shared_ptr<Tensor> out_tensor = std::make_shared<Tensor>();
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
     out_tensor.reset(output_tensor);

     predict(t1, t2, t3, 0);
@@ -130,17 +130,17 @@ int main() {
   }

   /// input x (1,3,224,224)
-  paddle_mobile::framework::Tensor inputx;
+  paddle_mobile::framework::LoDTensor inputx;
   SetupTensor<float>(&inputx, {3, 64, 1, 1}, static_cast<float>(1),
                      static_cast<float>(1));
   auto *inputx_ptr = inputx.data<float>();
   /// input y (224,)
-  paddle_mobile::framework::Tensor inputy;
+  paddle_mobile::framework::LoDTensor inputy;
   SetupTensor<float>(&inputy, {64, 10}, static_cast<float>(1.5),
                      static_cast<float>(1.5));
   auto *inputy_ptr = inputy.data<float>();

-  paddle_mobile::framework::Tensor inputz;
+  paddle_mobile::framework::LoDTensor inputz;
   SetupTensor<float>(&inputz, {10}, static_cast<float>(0),
                      static_cast<float>(1));
   auto *inputz_ptr = inputz.data<float>();

diff --git a/test/operators/test_lrn_op.cpp b/test/operators/test_lrn_op.cpp
index 2284b38abc3..ba35639fb71 100644
--- a/test/operators/test_lrn_op.cpp
+++ b/test/operators/test_lrn_op.cpp
@@ -12,118 +12,51 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once
+#include "../executor_for_test.h"
 #include "../test_include.h"
 #include "operators/lrn_op.h"

-namespace paddle_mobile {
-namespace framework {
-
-template <typename Dtype>
-class TestLrnOp {
- public:
-  explicit TestLrnOp(const Program<Dtype> p) : program_(p) {
-    if (use_optimize_) {
-      to_predict_program_ = program_.optimizeProgram;
-    } else {
-      to_predict_program_ = program_.originProgram;
-    }
-
-    const std::vector<std::shared_ptr<BlockDesc>> blocks =
-        to_predict_program_->Blocks();
-    // DLOG << " **block size " << blocks.size();
-    for (int i = 0; i < blocks.size(); ++i) {
-      std::shared_ptr<BlockDesc> block_desc = blocks[i];
-      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      // DLOG << " ops " << ops.size();
-      for (int j = 0; j < ops.size(); ++j) {
-        std::shared_ptr<OpDesc> op = ops[j];
-        if (op->Type() == "lrn" && op->Input("X")[0] == "pool2d_0.tmp_0") {
-          DLOG << " mul attr size: " << op->GetAttrMap().size();
-          DLOG << " inputs size: " << op->GetInputs().size();
-          DLOG << " outputs size: " << op->GetOutputs().size();
-          DLOG << " Input X is : " << op->Input("X")[0];
-          DLOG << " Output Out is : " << op->Output("Out")[0];
-          DLOG << " n : " << op->GetAttrMap().at("n").Get<int>();
-          DLOG << " alpha : " << op->GetAttrMap().at("alpha").Get<float>();
-          DLOG << " beta : " << op->GetAttrMap().at("beta").Get<float>();
-          DLOG << " k : " << op->GetAttrMap().at("k").Get<float>();
-          std::shared_ptr<operators::LrnOp<Dtype, float>> lrn =
-              std::make_shared<operators::LrnOp<Dtype, float>>(
-                  op->Type(), op->GetInputs(), op->GetOutputs(),
-                  op->GetAttrMap(), program_.scope);
-          ops_of_block_[*block_desc.get()].push_back(lrn);
-        }
-      }
-    }
-  }
-
-  std::shared_ptr<Tensor> predict_lrn(const Tensor &t1) {
-    // feed
-    auto scope = program_.scope;
-    Variable *x1_feed_value = scope->Var("pool2d_0.tmp_0");
-    auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
-    tensor_x1->ShareDataWith(t1);
-
-    Variable *con_output = scope->Var("pool1_norm1.tmp_1");
-    auto *output_tensor = con_output->GetMutable<Tensor>();
-    output_tensor->mutable_data<float>({3, 4, 2, 2});
-    // DLOG << typeid(output_tensor).name();
-    // DLOG << "output_tensor dims: " << output_tensor->dims();
-
-    std::shared_ptr<Tensor> out_tensor = std::make_shared<Tensor>();
-    out_tensor.reset(output_tensor);
+int main() {
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(g_googlenet);
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");

-    predict_lrn(t1, 0);
-    return out_tensor;
-  }
+  Executor4Test<paddle_mobile::CPU,
+                paddle_mobile::operators::LrnOp<paddle_mobile::CPU, float>>
+      executor(program, "lrn");

- private:
-  const framework::Program<Dtype> program_;
-  std::shared_ptr<ProgramDesc> to_predict_program_;
-  std::map<framework::BlockDesc,
-           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
-      ops_of_block_;
-  bool use_optimize_ = false;
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;

-  void predict_lrn(const Tensor &t1, int block_id) {
-    std::shared_ptr<BlockDesc> to_predict_block =
-        to_predict_program_->Block(block_id);
-    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
-      auto op = ops_of_block_[*to_predict_block.get()][j];
-      DLOG << "op -> run()";
-      op->Run();
-    }
-  }
-};
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {3, 4, 2, 2}, 0, 1);
+  input_tensors.push_back(input1);

-template class TestLrnOp<CPU>;
-}  // namespace framework
-}  // namespace paddle_mobile
+  // 2. input_names
+  vector<string> input_names({
+      "pool2d_0.tmp_0",
+  });

-int main() {
-  DLOG << "----------**********----------";
-  DLOG << "begin to run LrnOp Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string("../../test/models/googlenet"));
+  // 3. output_names
+  vector<string> output_names({"pool1_norm1.tmp_1"});

-  /// input x (3,4,2,2)
-  paddle_mobile::framework::Tensor inputx1;
-  SetupTensor<float>(&inputx1, {3, 4, 2, 2}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto *inputx1_ptr = inputx1.data<float>();
+  // 4. out_dims;
+  vector<DDim> out_ddims;
+  auto out_ddim = paddle_mobile::framework::make_ddim({3, 4, 2, 2});
+  out_ddims.push_back(out_ddim);

-  paddle_mobile::framework::TestLrnOp<paddle_mobile::CPU> testLrnOp(program);
+  auto output = executor.predict<LoDTensor>(input_tensors, input_names,
+                                            output_names, out_ddims);

-  auto output_lrn = testLrnOp.predict_lrn(inputx1);
-  auto *output_lrn_ptr = output_lrn->data<float>();
+  auto output0_data = output[0]->data<float>();

   DLOG << " LrnOp input: ";
   for (int i = 0; i < 3; i++) {
     for (int j = 0; j < 4; j++) {
       for (int c = 0; c < 2; c++) {
         for (int d = 0; d < 2; d++) {
-          DLOGF("%f ", inputx1_ptr[i * 16 + j * 4 + c * 2 + d]);
+          DLOGF("%f ", input1_data[i * 16 + j * 4 + c * 2 + d]);
         }
         DLOGF("\n");
       }
@@ -136,7 +69,7 @@
     for (int j = 0; j < 4; j++) {
       for (int c = 0; c < 2; c++) {
         for (int d = 0; d < 2; d++) {
-          DLOGF("%f ", output_lrn_ptr[i * 16 + j * 4 + c * 2 + d]);
+          DLOGF("%f ", output0_data[i * 16 + j * 4 + c * 2 + d]);
         }
         DLOGF("\n");
       }
@@ -144,8 +77,8 @@
   }
   DLOGF("\n");
-  DLOG << inputx1_ptr[0] << " / ((1 + 0.00002 * ( " << inputx1_ptr[0] << "^2 + "
-       << inputx1_ptr[4] << "^2 + " << inputx1_ptr[8] << "^2 ))^0.75) = ";
-  DLOG << output_lrn_ptr[0];
+  DLOG << input1_data[0] << " / ((1 + 0.00002 * ( " << input1_data[0] << "^2 + "
+       << input1_data[4] << "^2 + " << input1_data[8] << "^2 ))^0.75) = ";
+  DLOG << output0_data[0];
   return 0;
 }

diff --git a/test/operators/test_mul_op.cpp b/test/operators/test_mul_op.cpp
index 061a942ed85..8acd4a99470 100644
--- a/test/operators/test_mul_op.cpp
+++ b/test/operators/test_mul_op.cpp
@@ -12,158 +12,81 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once
+#include "../executor_for_test.h"
 #include "../test_include.h"
 #include "operators/mul_op.h"

-namespace paddle_mobile {
-namespace framework {
-
-template <typename Dtype>
-class TestMulOp {
- public:
-  explicit TestMulOp(const Program<Dtype> p) : program_(p) {
-    if (use_optimize_) {
-      to_predict_program_ = program_.optimizeProgram;
-    } else {
-      to_predict_program_ = program_.originProgram;
-    }
-
-    const std::vector<std::shared_ptr<BlockDesc>> blocks =
-        to_predict_program_->Blocks();
-    // DLOG << " **block size " << blocks.size();
-    for (int i = 0; i < blocks.size(); ++i) {
-      std::shared_ptr<BlockDesc> block_desc = blocks[i];
-      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      // DLOG << " ops " << ops.size();
-      for (int j = 0; j < ops.size(); ++j) {
-        std::shared_ptr<OpDesc> op = ops[j];
-        if (op->Type() == "mul" && op->Input("X")[0] == "pool2d_0.tmp_0") {
-          DLOG << " mul attr size: " << op->GetAttrMap().size();
-          DLOG << " inputs size: " << op->GetInputs().size();
-          DLOG << " outputs size: " << op->GetOutputs().size();
-          DLOG << " Input X is : " << op->Input("X")[0];
-          DLOG << " Input Y is : " << op->Input("Y")[0];
-          DLOG << " Output Out is : " << op->Output("Out")[0];
-          DLOG << "x_num_col_dims : "
-               << op->GetAttrMap().at("x_num_col_dims").Get<int>();
-          DLOG << "y_num_col_dims : "
-               << op->GetAttrMap().at("y_num_col_dims").Get<int>();
-
-          std::shared_ptr<operators::MulOp<Dtype, float>> mul =
-              std::make_shared<operators::MulOp<Dtype, float>>(
-                  op->Type(), op->GetInputs(), op->GetOutputs(),
-                  op->GetAttrMap(), program_.scope);
-          ops_of_block_[*block_desc.get()].push_back(mul);
-        }
-      }
-    }
-  }
-
-  std::shared_ptr<Tensor> predict_mul(const Tensor &t1, const Tensor &t2) {
-    // feed
-    auto scope = program_.scope;
-    Variable *x_feed_value = scope->Var("pool2d_0.tmp_0");
-    auto tensor_x = x_feed_value->GetMutable<Tensor>();
-    tensor_x->ShareDataWith(t1);
-
-    Variable *y_feed_value = scope->Var("fc_0.w_0");
-    auto tensor_y = y_feed_value->GetMutable<Tensor>();
-    tensor_y->ShareDataWith(t2);
-
-    Variable *con_output = scope->Var("fc_0.tmp_0");
-    auto *output_tensor = con_output->GetMutable<Tensor>();
-    output_tensor->mutable_data<float>({3, 3});
-    // DLOG << typeid(output_tensor).name();
-    // DLOG << "output_tensor dims: " << output_tensor->dims();
-
-    std::shared_ptr<Tensor> out_tensor = std::make_shared<Tensor>();
-    out_tensor.reset(output_tensor);
-
-    predict_mul(t1, t2, 0);
-    return out_tensor;
-  }
-
- private:
-  const framework::Program<Dtype> program_;
-  std::shared_ptr<ProgramDesc> to_predict_program_;
-  std::map<framework::BlockDesc,
-           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
-      ops_of_block_;
-  bool use_optimize_ = false;
-
-  void predict_mul(const Tensor &t1, const Tensor &t2, int block_id) {
-    std::shared_ptr<BlockDesc> to_predict_block =
-        to_predict_program_->Block(block_id);
-    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
-      auto op = ops_of_block_[*to_predict_block.get()][j];
-      DLOG << "op -> run()";
-      op->Run();
-    }
-  }
-};
-
-template class TestMulOp<CPU>;
-}  // namespace framework
-}  // namespace paddle_mobile
-
 int main() {
-  DLOG << "----------**********----------";
-  DLOG << "begin to run MulOp Test";
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program =
-      loader.Load(std::string("../../test/models/"
-                              "image_classification_resnet.inference.model"));
-
-  /// input x (3,2,1,1)
-  paddle_mobile::framework::Tensor inputx;
-  SetupTensor<float>(&inputx, {3, 2, 1, 1}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto *inputx_ptr = inputx.data<float>();
-
-  /// input y (2,3)
-  paddle_mobile::framework::Tensor inputy;
-  SetupTensor<float>(&inputy, {2, 3}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto *inputy_ptr = inputy.data<float>();
-
-  paddle_mobile::framework::TestMulOp<paddle_mobile::CPU> testMulOp(program);
-
-  auto output_mul = testMulOp.predict_mul(inputx, inputy);
-  auto *output_mul_ptr = output_mul->data<float>();
-
-  auto dimx_1 = inputx.numel() / inputx.dims()[0];
-  DLOG << " inputx : ";
-  for (int i = 0; i < inputx.dims()[0]; ++i) {
-    for (int j = 0; j < dimx_1; ++j) {
-      DLOGF("%f ", inputx_ptr[i * dimx_1 + j]);
+  auto program = loader.Load(g_resnet);
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::CPU,
+                paddle_mobile::operators::MulOp<paddle_mobile::CPU, float>>
+      executor(program, "mul");
+
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;
+
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {3, 2, 1, 1}, 0, 1);
+  input_tensors.push_back(input1);
+  Tensor input2;
+  auto input2_data = CreateInput<float>(&input2, {2, 3}, 0, 1);
+  input_tensors.push_back(input2);
+
+  // 2. input_names
+  vector<string> input_names({
+      "pool2d_0.tmp_0",
+      "fc_0.w_0",
+  });
+
+  // 3. output_names
+  vector<string> output_names({"fc_0.tmp_0"});
+
+  // 4. out_dims;
+  vector<DDim> out_ddims;
+  auto out_ddim = paddle_mobile::framework::make_ddim({3, 3});
+  out_ddims.push_back(out_ddim);
+
+  auto output = executor.predict<LoDTensor>(input_tensors, input_names,
+                                            output_names, out_ddims);
+
+  auto output0_data = output[0]->data<float>();
+
+  auto dim_1 = input1.numel() / input1.dims()[0];
+  DLOG << " input1 : ";
+  for (int i = 0; i < input1.dims()[0]; ++i) {
+    for (int j = 0; j < dim_1; ++j) {
+      DLOGF("%f ", input1_data[i * dim_1 + j]);
     }
     DLOGF("\n");
   }

-  auto dimy_1 = inputy.numel() / inputy.dims()[0];
-  DLOG << " inputy : ";
-  for (int i = 0; i < inputy.dims()[0]; ++i) {
-    for (int j = 0; j < dimy_1; ++j) {
-      DLOGF("%f ", inputy_ptr[i * dimx_1 + j]);
+  auto dim_2 = input2.numel() / input2.dims()[0];
+  DLOG << " input2 : ";
+  for (int i = 0; i < input2.dims()[0]; ++i) {
+    for (int j = 0; j < dim_2; ++j) {
+      DLOGF("%f ", input2_data[i * dim_2 + j]);
     }
     DLOGF("\n");
   }

-  auto dim_output_1 = output_mul->numel() / output_mul->dims()[0];
+  auto dim_output0 = output[0]->numel() / output[0]->dims()[0];
   DLOG << " output : ";
-  for (int i = 0; i < output_mul->dims()[0]; ++i) {
-    for (int j = 0; j < dim_output_1; ++j) {
-      DLOGF("%f ", output_mul_ptr[i * dimy_1 + j]);
+  for (int i = 0; i < output[0]->dims()[0]; ++i) {
+    for (int j = 0; j < dim_output0; ++j) {
+      DLOGF("%f ", output0_data[i * dim_2 + j]);
     }
     DLOGF("\n");
   }

   /// output (3,3)
-  DLOG << "output memory size : " << output_mul->memory_size();
-  DLOG << "output numel : " << output_mul->numel();
+  DLOG << "output memory size : " << output[0]->memory_size();
+  DLOG << "output numel : " << output[0]->numel();

-  DLOG << inputx_ptr[0] << " x " << inputy_ptr[0] << " + " << inputx_ptr[1]
-       << " x " << inputy_ptr[0 + 3] << " = " << output_mul_ptr[0];
+  DLOG << input1_data[0] << " x " << input2_data[0] << " + " << input1_data[1]
+       << " x " << input2_data[0 + 3] << " = " << output0_data[0];
   return 0;
 }
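[Editor's note] The single-element check at the end of the mul test generalizes to a full
reference matmul; a sketch under the same {3,2} x {2,3} shapes (the helper is hypothetical
and not part of the patch):

    // Naive reference for out = x * y with x: {3,2} (flattened from {3,2,1,1})
    // and y: {2,3}; compare each out[i*N + j] against output0_data[i*N + j].
    void ReferenceMul(const float *x, const float *y, float *out) {
      const int M = 3, K = 2, N = 3;
      for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
          float acc = 0;
          for (int k = 0; k < K; ++k) {
            acc += x[i * K + k] * y[k * N + j];  // row of x dot column of y
          }
          out[i * N + j] = acc;
        }
      }
    }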
diff --git a/test/operators/test_multiclass_nms_op.cpp b/test/operators/test_multiclass_nms_op.cpp
index 01ad72b9bbc..e6c41bd4b3b 100644
--- a/test/operators/test_multiclass_nms_op.cpp
+++ b/test/operators/test_multiclass_nms_op.cpp
@@ -77,15 +77,15 @@ class TestMultiClassNMSOp {
     // feed
     auto scope = program_.scope;
     Variable *x1_feed_value = scope->Var("box_coder_0.tmp_0");
-    auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
+    auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
     tensor_x1->ShareDataWith(t1);

     Variable *x2_feed_value = scope->Var("transpose_12.tmp_0");
-    auto tensor_x2 = x2_feed_value->GetMutable<Tensor>();
+    auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
     tensor_x2->ShareDataWith(t2);

     Variable *output = scope->Var("detection_output_0.tmp_0");
-    auto *output_tensor = output->GetMutable<Tensor>();
+    auto *output_tensor = output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>({1917, 6});

     // DLOG << typeid(output_tensor).name();

diff --git a/test/operators/test_prior_box_op.cpp b/test/operators/test_prior_box_op.cpp
index e365c4ed851..80ede944936 100644
--- a/test/operators/test_prior_box_op.cpp
+++ b/test/operators/test_prior_box_op.cpp
@@ -72,19 +72,19 @@ class TestPriorBoxOp {
     // feed
     auto scope = program_.scope;
     Variable *x1_feed_value = scope->Var("image");
-    auto tensor_x1 = x1_feed_value->GetMutable<Tensor>();
+    auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
     tensor_x1->ShareDataWith(t1);

     Variable *x2_feed_value = scope->Var("batch_norm_26.tmp_3");
-    auto tensor_x2 = x2_feed_value->GetMutable<Tensor>();
+    auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
     tensor_x2->ShareDataWith(t2);

     Variable *boxes_output = scope->Var("prior_box_1.tmp_0");
-    auto *boxes_output_tensor = boxes_output->GetMutable<Tensor>();
+    auto *boxes_output_tensor = boxes_output->GetMutable<LoDTensor>();
     boxes_output_tensor->mutable_data<float>({10, 10, 6, 4});

     Variable *variances_output = scope->Var("prior_box_1.tmp_1");
-    auto *variances_output_tesnor = variances_output->GetMutable<Tensor>();
+    auto *variances_output_tesnor = variances_output->GetMutable<LoDTensor>();
     variances_output_tesnor->mutable_data<float>({10, 10, 6, 4});
     // DLOG << typeid(output_tensor).name();
     // DLOG << "output_tensor dims: " << output_tensor->dims();

diff --git a/test/operators/test_relu_op.cpp b/test/operators/test_relu_op.cpp
index 6fefb0368be..fb68b921113 100644
--- a/test/operators/test_relu_op.cpp
+++ b/test/operators/test_relu_op.cpp
@@ -14,12 +14,11 @@ limitations under the License. */

 #include "../executor_for_test.h"
 #include "../test_include.h"
+#include "operators/relu_op.h"

 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  // ../models/image_classification_resnet.inference.model
-  auto program = loader.Load(g_mobilenet_ssd);
-
+  auto program = loader.Load(g_resnet);
   PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                         "program file read fail");

@@ -27,17 +26,33 @@
                 paddle_mobile::operators::ReluOp<paddle_mobile::CPU, float>>
       executor(program, "relu");

-  paddle_mobile::framework::Tensor input;
-  SetupTensor<float>(&input, {1, 2, 3, 4}, static_cast<float>(-1),
-                     static_cast<float>(1));
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;
+
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {1, 2, 3, 4}, -1, 1);
+  input_tensors.push_back(input1);
+
+  // 2. input_names
+  vector<string> input_names({
+      "batch_norm_0.tmp_2",
+  });

+  // 3. output_names
+  vector<string> output_names({"batch_norm_0.tmp_3"});
+
+  // 4. out_dims;
+  vector<DDim> out_ddims;
   auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4});
-  auto output = executor.predict(input, "batch_norm_0.tmp_2",
-                                 "batch_norm_0.tmp_3", out_ddim);
+  out_ddims.push_back(out_ddim);
+
+  auto output = executor.predict<LoDTensor>(input_tensors, input_names,
+                                            output_names, out_ddims);
+
+  auto output0_data = output[0]->data<float>();

-  auto output_ptr = output->data<float>();
-  for (int j = 0; j < output->numel(); ++j) {
-    DLOG << " value of output: " << output_ptr[j];
+  for (int j = 0; j < output[0]->numel(); ++j) {
+    DLOG << " value of output: " << output0_data[j];
   }
   return 0;
 }

diff --git a/test/test_helper.h b/test/test_helper.h
index e2d6a183cb7..c0c301840fa 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -43,6 +43,12 @@ void SetupTensor(paddle_mobile::framework::Tensor *input,
   }
 }

+template <typename T>
+T *CreateInput(Tensor *input, DDim dims, T low, T up) {
+  SetupTensor<T>(input, dims, static_cast<T>(low), static_cast<T>(up));
+  return input->data<T>();
+}
+
 template <typename T>
 void GetInput(const std::string &input_name, std::vector<T> *input,
               const std::vector<int64_t> &dims) {

From 0d837d7adb7c95e6f46ccd8f93682bb33d90fece Mon Sep 17 00:00:00 2001
From: eclipsess
Date: Tue, 29 May 2018 16:13:38 +0800
Subject: [PATCH 04/26] code style

---
 test/test_helper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_helper.h b/test/test_helper.h
index 6b5d8335db0..029ed9742f6 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -29,8 +29,8 @@ static const std::string g_resnet =
     "../models/image_classification_resnet.inference.model";
 static const std::string g_test_image_1x3x224x224 =
     "../images/test_image_1x3x224x224_float";
-using paddle_mobile::framework::Tensor;
 using paddle_mobile::framework::DDim;
+using paddle_mobile::framework::Tensor;
 template <typename T>
 void SetupTensor(paddle_mobile::framework::Tensor *input,
                  paddle_mobile::framework::DDim dims, T lower, T upper) {

From 9f94c0e916958fa2a950496705612d288ea2f5f7 Mon Sep 17 00:00:00 2001
From: eclipsess
Date: Tue, 29 May 2018 18:14:50 +0800
Subject: [PATCH 05/26] restore test for single input and output

---
 test/executor_for_test.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/test/executor_for_test.h b/test/executor_for_test.h
index a54a8bb191a..89b54617826 100644
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -114,4 +114,29 @@ class Executor4Test : public Executor<DeviceType> {

     return output_tensor_sptrs;
   }
+
+  std::shared_ptr<Tensor> predict(const Tensor &t, string input, string output,
+                                  const DDim &dDim) {
+    auto scope = this->program_.scope;
+    Variable *g_feed_value = scope->Var(input);
+    auto tensor = g_feed_value->GetMutable<LoDTensor>();
+    tensor->ShareDataWith(t);
+
+    Variable *con_output = scope->Var(output);
+    auto *output_tensor = con_output->GetMutable<LoDTensor>();
+    output_tensor->mutable_data<float>(dDim);
+
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+    out_tensor.reset(output_tensor);
+
+    std::shared_ptr<BlockDesc> to_predict_block =
+        this->to_predict_program_->Block(0);
+    for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
+         ++j) {
+      auto op = this->ops_of_block_[*to_predict_block.get()][j];
+      op->Run();
+    }
+
+    return out_tensor;
+  }
 };
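[Editor's note] With patch 05 applied, Executor4Test exposes both entry points side by
side; a minimal sketch of how a single-tensor test keeps working next to the vector form,
assuming the signatures above (variable names are illustrative, taken from the relu test):

    // Single input/output path restored by patch 05.
    Tensor input;
    SetupTensor<float>(&input, {1, 2, 3, 4}, -1.f, 1.f);
    auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4});
    auto out = executor.predict(input, "batch_norm_0.tmp_2",
                                "batch_norm_0.tmp_3", out_ddim);
    auto *out_data = out->data<float>();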
src/operators/batchnorm_op.h | 11 +-- src/operators/box_coder_op.h | 2 +- src/operators/concat_op.h | 10 +-- src/operators/conv_op.h | 7 +- src/operators/elementwise_add_op.h | 13 ++-- src/operators/feed_op.h | 7 +- src/operators/fetch_op.h | 11 ++- src/operators/fusion_fc_op.h | 12 ++-- src/operators/lrn_op.h | 10 +-- src/operators/mul_op.h | 6 +- src/operators/multiclass_nms_op.h | 2 +- src/operators/pool_op.h | 22 +++--- src/operators/prior_box_op.h | 2 +- src/operators/relu_op.h | 2 +- src/operators/reshape_op.h | 2 +- src/operators/sigmoid_op.h | 2 +- src/operators/softmax_op.h | 2 +- src/operators/transpose_op.h | 2 +- test/executor_for_test.h | 2 +- test/framework/test_load.cpp | 2 +- test/framework/test_optimize.cpp | 2 +- test/net/test_googlenet.cpp | 2 +- test/operators/test_pool_op.cpp | 2 +- test/operators/test_reshape_op.cpp | 2 +- test/operators/test_sigmoid_op.cpp | 2 +- test/operators/test_softmax_op.cpp | 2 +- test/operators/test_transpose_op.cpp | 2 +- test/test_include.h | 2 +- 32 files changed, 175 insertions(+), 137 deletions(-) diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 857e8ea1c77..dfdf0af79ac 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -13,11 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "framework/operator.h" -#include "framework/op_info.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace framework { +template +vector OperatorBase::GetOutKeys() const { + auto it = op_input_output_key.find(type_); + if (it == op_input_output_key.end()) { + DLOG << type_ << " has no outputs"; + } + return it->second.second; +} + +template +static T *GetVarValue(const string &key, const VariableNameMap &var_map, + const Scope &scope) { + auto var_vec = var_map.at(key); + if (!var_vec.empty()) { + auto var = scope.FindVar(var_vec[0]); + return var->GetMutable(); + } else { + return nullptr; + } +} + template OperatorBase::OperatorBase(const std::string &type, const VariableNameMap &inputs, @@ -31,9 +52,22 @@ OperatorBase::OperatorBase(const std::string &type, scope_(scope) { CheckAllInputOutputSet(); } + template void OperatorBase::CheckAllInputOutputSet() const {} +template +void OperatorBase::Run() const { + RunImpl(); +#ifdef PADDLE_MOBILE_DEBUG + vector output_keys = GetOutKeys(); + for (const auto key : output_keys) { + Tensor *out_ = GetVarValue(key, outputs_, *scope_); + DLOG << type_ << " output- " << key << "=" << *out_; + } +#endif +} + template class OperatorBase; template class OperatorWithKernel; diff --git a/src/framework/operator.h b/src/framework/operator.h index 5a40a926630..549916b9a38 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -36,6 +36,8 @@ limitations under the License. 
*/ namespace paddle_mobile { namespace framework { +using std::string; +using std::vector; static std::unordered_map< std::string, std::pair, std::vector>> op_input_output_key = {{"conv2d", {{"Input"}, {"Output"}}}, @@ -57,7 +59,9 @@ class OperatorBase : PaddleMobileObject { const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope); virtual ~OperatorBase() {} - virtual void Run() const = 0; + void Run() const; + vector GetOutKeys() const; + virtual void RunImpl() const = 0; virtual void InferShape() const = 0; const VariableNameMap &Inputs() const { return inputs_; } @@ -88,7 +92,8 @@ class OperatorWithKernel : public OperatorBase { const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope) : OperatorBase(type, inputs, outputs, attrs, scope) {} - virtual void Run() const = 0; + + virtual void RunImpl() const = 0; virtual void InferShape() const = 0; }; @@ -113,7 +118,7 @@ class FusionOpMatcher : PaddleMobileObject { virtual std::string Type() = 0; - virtual void FolderNodes(Node &node) { + virtual void FolderNodes(const Node &node) { node.Folder(node_.Depth(), Type(), {}); } diff --git a/src/framework/tensor.h b/src/framework/tensor.h index b6a7c724ad1..674edd67733 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -18,11 +18,12 @@ limitations under the License. */ #include #include #include +#include #include #include -#include "data_layout.h" -#include "ddim.h" +#include "framework/data_layout.h" +#include "framework/ddim.h" #include "memory/t_malloc.h" namespace paddle_mobile { @@ -62,8 +63,8 @@ struct SizeOfTypeFunctor { static inline size_t SizeOfType(std::type_index type) { SizeOfTypeFunctor functor; size_t size = functor(type); - // PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", - // type.name()); + + PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); return size; } @@ -72,16 +73,27 @@ class LoDTensor; class Tensor { public: Tensor() : offset_(0) {} + template + Tensor(std::vector input, DDim ddim) : offset_(0) { + PADDLE_MOBILE_ENFORCE( + input.size() == framework::product(ddim), + "input vector's length should be equal to the tensor's length"); + auto input_ptr = mutable_data(ddim); + for (int i = 0; i < input.size(); ++i) { + input_ptr[i] = input[i]; + } + } /*! Return a pointer to mutable memory block. */ template inline T *data() { check_memory_size(); - // PADDLE_ENFORCE(std::is_same::value || - // holder_->type().hash_code() == - // typeid(T).hash_code(), - // "Tensor holds the wrong type, it holds %s", - // this->holder_->type().name()); + PADDLE_MOBILE_ENFORCE( + (std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code()), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } @@ -90,11 +102,11 @@ class Tensor { template inline const T *data() const { check_memory_size(); - // PADDLE_ENFORCE(std::is_same::value || - // holder_->type().hash_code() == - // typeid(T).hash_code(), - // "Tensor holds the wrong type, it holds %s", - // this->holder_->type().name()); + PADDLE_MOBILE_ENFORCE( + (std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code()), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); @@ -116,17 +128,11 @@ class Tensor { if (holder_ != nullptr) { holder_->set_type(type); } - // PADDLE_ENFORCE_GE(numel(), 0, - // "When calling this method, the Tensor's - // numel must be - // " "equal or larger than zero. " "Please - // check - // Tensor::Resize has been called first."); + PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must be >= 0.") int64_t size = numel() * SizeOfType(type); /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || holder_->size() < size + offset_) { holder_.reset(new PlaceholderImpl(size, type)); - offset_ = 0; } return reinterpret_cast( @@ -179,16 +185,13 @@ class Tensor { */ inline Tensor Slice(int begin_idx, int end_idx) const { check_memory_size(); - // PADDLE_ENFORCE_GE(begin_idx, 0, - // "The start row index must be greater than - // 0."); - // PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is - // out of - // bound."); PADDLE_ENFORCE_LT( - // begin_idx, end_idx, - // "The start row index must be lesser than the end row - // index."); - + PADDLE_MOBILE_ENFORCE(begin_idx >= 0, + "The start row index must be greater than or equal to 0.") + PADDLE_MOBILE_ENFORCE(end_idx <= dims_[0], + "The end row index is out of bound.") + PADDLE_MOBILE_ENFORCE( + begin_idx < end_idx, + "The start row index must be less than the end row index") if (dims_[0] == 1) { return *this; } else { @@ -205,10 +208,9 @@ class Tensor { } std::type_index type() const { - // PADDLE_ENFORCE_NOT_NULL( - // holder_, "Tensor not initialized yet - // when - // Tensor::type() is called."); + PADDLE_MOBILE_ENFORCE( + holder_ != nullptr, + "Tensor not initialized yet when Tensor::type() is called.") return holder_->type(); } @@ -221,12 +223,8 @@ class Tensor { PADDLE_MOBILE_ENFORCE( holder_ != nullptr, "Tensor holds no memory. Call Tensor::mutable_data first."); - PADDLE_MOBILE_ENFORCE( - numel() * SizeOfType(type()) <= memory_size(), - "Tensor's dims_ is out of bound. CallTensor::mutable_data " - "first to re-allocate memory.\n" - "or maybe the required data-type mismatches the data\ already stored."); + PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(), + "Tensor's dims_ is out of bound. "); } inline DataLayout layout() const { return layout_; } @@ -257,13 +255,8 @@ class Tensor { memory::PODDeleter()), size_(size), type_(type) { - // PADDLE_ENFORCE_NOT_NULL(ptr_, - // "Insufficient %s - // memory to allocation.", - // (is_cpu_place(place_) - // ?
- // "CPU" : - // "GPU")); + PADDLE_MOBILE_ENFORCE(ptr_ != nullptr, + "Insufficient memory to allocation"); } virtual size_t size() const { return size_; } @@ -321,6 +314,19 @@ class Tensor { size_t offset_; }; +#ifdef PADDLE_MOBILE_DEBUG +inline Print &operator<<(Print &printer, const Tensor &tensor) { + printer << " dims: " << tensor.dims() << "\n"; + int stride = tensor.numel() / 20; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < tensor.numel(); i += stride) { + printer << tensor.data()[i] << " "; + } + return printer; +} + +#endif + inline Tensor ReshapeToMatrix(const Tensor &src, int num_col_dims) { Tensor res; res.ShareDataWith(src); diff --git a/src/io.cpp b/src/io.cpp index 1c5e97bbb7e..bfb3c5a7e2b 100644 --- a/src/io.cpp +++ b/src/io.cpp @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io.h" +#include "/io.h" #include #include - #include "common/enforce.h" #include "common/log.h" #include "framework/framework.pb-c.h" @@ -53,7 +52,7 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) { DLOG << "model size: " << size; - *out = (uint8_t *)malloc(size); + *out = reinterpret_cast(size); size_t cur_len = 0; size_t nread; @@ -364,7 +363,7 @@ void Executor::LoadMemory(const framework::VarDesc var_desc, is.read(static_cast(memory), memory_size * type_size); is.close(); -}; +} template void Executor::InitMemory() { @@ -381,6 +380,7 @@ void Executor::InitMemory() { } else { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { auto tensor = var->template GetMutable(); + tensor->template mutable_data(); } } @@ -406,15 +406,7 @@ void Executor::predict(const framework::Tensor &t, int block_id) { template std::vector::Ptype> Executor::predict( const std::vector &input, const std::vector &dims) { - DLOG << "start predict: "; - - framework::LoDTensor tensor; - auto ddim = framework::make_ddim(dims); - - auto input_ptr = tensor.mutable_data(ddim); - for (int i = 0; i < input.size(); ++i) { - input_ptr[i] = input[i]; - } + framework::Tensor tensor(input, framework::make_ddim(dims)); predict(tensor, 0); diff --git a/src/operators/batchnorm_op.h b/src/operators/batchnorm_op.h index 072fbd5f424..760466eeddc 100644 --- a/src/operators/batchnorm_op.h +++ b/src/operators/batchnorm_op.h @@ -12,19 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
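The operator<<(Print &, const Tensor &) added to tensor.h above deliberately prints at most about 20 evenly spaced elements (stride = numel / 20) so that dumping a large tensor stays readable. The same sampling idea in isolation, over a plain float vector (a standalone sketch, not the framework's Print type):

#include <cstdio>
#include <vector>

// Print at most ~20 evenly spaced samples of a buffer, like the Tensor printer.
void DebugPrintSampled(const std::vector<float> &data) {
  size_t stride = data.size() / 20;
  if (stride == 0) stride = 1;  // small buffers: print every element
  for (size_t i = 0; i < data.size(); i += stride) {
    std::printf("%g ", data[i]);
  }
  std::printf("\n");
}

int main() {
  std::vector<float> v(1000);
  for (size_t i = 0; i < v.size(); ++i) v[i] = static_cast<float>(i);
  DebugPrintSampled(v);  // emits 20 values: 0 50 100 ... 950
}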
*/ +#pragma once + +#include #include "framework/operator.h" #include "operators/kernel/batchnorm_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { - -using namespace framework; - +using std::string; template class BatchNormOp : public framework::OperatorWithKernel { public: - BatchNormOp(const std::string &type, const VariableNameMap &inputs, + BatchNormOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) @@ -32,7 +33,7 @@ class BatchNormOp : public framework::OperatorWithKernel { scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::BatchNormKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/box_coder_op.h b/src/operators/box_coder_op.h index 76f4b151742..a2203e1d89f 100644 --- a/src/operators/box_coder_op.h +++ b/src/operators/box_coder_op.h @@ -36,7 +36,7 @@ class BoxCoderOp : public framework::OperatorWithKernel { scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::BoxCoderKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h index 611e46af6a6..15160e20a40 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -13,25 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include #include "framework/operator.h" #include "operators/kernel/concat_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { - -using namespace framework; - +using std::string; template class ConcatOp : public framework::OperatorWithKernel { public: - ConcatOp(const std::string &type, const VariableNameMap &inputs, + ConcatOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) : framework::OperatorWithKernel(type, inputs, outputs, attrs, scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::ConcatKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h index 047fa1a8e6c..1557f2f06ee 100644 --- a/src/operators/conv_op.h +++ b/src/operators/conv_op.h @@ -14,14 +14,13 @@ limitations under the License. */ #pragma once +#include #include "framework/operator.h" #include "operators/kernel/conv_kernel.h" namespace paddle_mobile { namespace operators { - -using namespace framework; - +using std::string; template class ConvOp : public framework::OperatorWithKernel { public: @@ -35,7 +34,7 @@ class ConvOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape() const override; - void Run() const { + void RunImpl() const { operators::ConvKernel kernel; kernel.Compute(param_); this->ClearVariables({"Filter", "Input"}); diff --git a/src/operators/elementwise_add_op.h b/src/operators/elementwise_add_op.h index 47fa52c4696..7dd7e147a06 100644 --- a/src/operators/elementwise_add_op.h +++ b/src/operators/elementwise_add_op.h @@ -12,19 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
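Each operator body above follows the same three-part shape: a Param object captures the operand pointers once at construction, a stateless Kernel holds only the math, and RunImpl() merely instantiates the kernel and hands it the param. A toy version of that split, with made-up ReluParam/ReluKernel/ReluOp names standing in for the framework's templated types:

#include <iostream>
#include <vector>

// Param gathers the operand pointers once, at construction time.
struct ReluParam {
  const std::vector<float> *input;
  std::vector<float> *output;
};

// Kernel is a stateless functor holding only the math.
struct ReluKernel {
  void Compute(const ReluParam &p) const {
    p.output->resize(p.input->size());
    for (size_t i = 0; i < p.input->size(); ++i)
      (*p.output)[i] = (*p.input)[i] > 0 ? (*p.input)[i] : 0;
  }
};

// The operator's RunImpl just instantiates the kernel and hands it the param.
struct ReluOp {
  ReluParam param;
  void RunImpl() const {
    ReluKernel kernel;
    kernel.Compute(param);
  }
};

int main() {
  std::vector<float> in = {-1.0f, 2.0f, -3.0f, 4.0f};
  std::vector<float> out;
  ReluOp op{{&in, &out}};
  op.RunImpl();
  for (float v : out) std::cout << v << " ";  // 0 2 0 4
  std::cout << "\n";
}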
*/ +#pragma once + +#include #include "framework/operator.h" #include "kernel/elementwise_add_kernel.h" -#include "op_param.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { - -using namespace framework; - +using std::string; template class ElementwiseAddOp : public framework::OperatorWithKernel { public: - ElementwiseAddOp(const std::string &type, const VariableNameMap &inputs, + ElementwiseAddOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) @@ -32,7 +33,7 @@ class ElementwiseAddOp : public framework::OperatorWithKernel { scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::ElementwiseAddKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 426d5f6220d..25a82894ea9 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -14,22 +14,23 @@ limitations under the License. */ #pragma once +#include #include "framework/operator.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { - +using std::string; template class FeedOp : public framework::OperatorBase { public: - FeedOp(const std::string &type, const VariableNameMap &inputs, + FeedOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) : framework::OperatorBase(type, inputs, outputs, attrs, scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { param_.Out()->ShareDataWith(*param_.InputX()); } + void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } void InferShape() const { auto out_dims = param_.Out()->dims(); diff --git a/src/operators/fetch_op.h b/src/operators/fetch_op.h index 7dddd679929..31e17f2b562 100644 --- a/src/operators/fetch_op.h +++ b/src/operators/fetch_op.h @@ -14,27 +14,24 @@ limitations under the License. */ #pragma once +#include #include "framework/operator.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { +using std::string; template class FetchOp : public framework::OperatorBase { public: - FetchOp(const std::string &type, const VariableNameMap &inputs, + FetchOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) : framework::OperatorBase(type, inputs, outputs, attrs, scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { - param_.Out()->ShareDataWith(*param_.InputX()); - for (int i = 0; i < param_.Out()->numel(); ++i) { - DLOG << param_.Out()->template data()[i]; - } - } + void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } void InferShape() const { auto x_dims = param_.InputX()->dims(); diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index 1dd5d2bf535..6e0c50170a1 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" @@ -22,7 +23,8 @@ limitations under the License. 
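FeedOp and FetchOp above now reduce to a single ShareDataWith call, meaning the output tensor aliases the input's buffer rather than copying it (the per-element DLOG loop in FetchOp is gone). A minimal model of that aliasing semantics, assuming nothing beyond the standard library; MiniTensor is an invented name for illustration:

#include <iostream>
#include <memory>
#include <vector>

// Two "tensors" point at one reference-counted buffer, so feed/fetch are O(1)
// and writes through either handle are visible to both.
struct MiniTensor {
  std::shared_ptr<std::vector<float>> buf;
  void ShareDataWith(const MiniTensor &other) { buf = other.buf; }  // alias, no copy
};

int main() {
  MiniTensor in{std::make_shared<std::vector<float>>(4, 1.0f)};
  MiniTensor out;
  out.ShareDataWith(in);               // feed: output aliases input storage
  (*in.buf)[0] = 42.0f;                // mutate through the input handle
  std::cout << (*out.buf)[0] << "\n";  // prints 42: same underlying buffer
}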
*/ namespace paddle_mobile { namespace operators { - +using std::string; +using std::vector; class FusionFcMatcher : public framework::FusionOpMatcher { public: FusionFcMatcher() { @@ -30,8 +32,8 @@ class FusionFcMatcher : public framework::FusionOpMatcher { node_ > std::make_shared("elementwise_add"); } - void FolderNodes(framework::Node &node) { - std::vector> origin_descs = + void FolderNodes(const framework::Node &node) { + vector> origin_descs = node.OpDescs(node_.Depth()); node.Folder(node_.Depth(), Type(), {{"elementwise_add", {"Y", "Z"}}}); } @@ -42,7 +44,7 @@ class FusionFcMatcher : public framework::FusionOpMatcher { template class FushionFcOp : public framework::OperatorWithKernel { public: - FushionFcOp(const std::string &type, const VariableNameMap &inputs, + FushionFcOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) @@ -50,7 +52,7 @@ class FushionFcOp : public framework::OperatorWithKernel { scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::FushionFcKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/lrn_op.h b/src/operators/lrn_op.h index 112053b97f9..e5d98e1bb10 100644 --- a/src/operators/lrn_op.h +++ b/src/operators/lrn_op.h @@ -11,27 +11,27 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once +#include #include "framework/operator.h" #include "operators/kernel/lrn_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { - -using namespace framework; - +using std::string; template class LrnOp : public framework::OperatorWithKernel { public: - LrnOp(const std::string &type, const VariableNameMap &inputs, + LrnOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) : framework::OperatorWithKernel(type, inputs, outputs, attrs, scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::LrnKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/mul_op.h b/src/operators/mul_op.h index 8685651ea68..ded618551fc 100644 --- a/src/operators/mul_op.h +++ b/src/operators/mul_op.h @@ -11,7 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once +#include #include "framework/operator.h" #include "operators/kernel/mul_kernel.h" #include "operators/op_param.h" @@ -19,8 +21,6 @@ limitations under the License. 
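The FusionFcMatcher above folds a mul node followed by an elementwise_add into one fc node, so the fused operator computes y = x·W + b in a single pass without materializing the intermediate product. The arithmetic being fused, shown on hard-coded toy shapes (a sketch only, not the framework's kernel):

#include <cstdio>

int main() {
  const int M = 2, K = 3, N = 2;
  float x[M][K] = {{1, 2, 3}, {4, 5, 6}};
  float w[K][N] = {{1, 0}, {0, 1}, {1, 1}};
  float b[N] = {0.5f, -0.5f};
  float y[M][N];
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) {
      float acc = b[j];  // bias folded into the same accumulation loop
      for (int k = 0; k < K; ++k) acc += x[i][k] * w[k][j];
      y[i][j] = acc;
    }
  for (int i = 0; i < M; ++i) std::printf("%f %f\n", y[i][0], y[i][1]);
}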
*/ namespace paddle_mobile { namespace operators { -using namespace framework; - template class MulOp : public framework::OperatorWithKernel { public: @@ -31,7 +31,7 @@ class MulOp : public framework::OperatorWithKernel { scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::MulKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/multiclass_nms_op.h b/src/operators/multiclass_nms_op.h index 40466af6074..c424856b8cd 100644 --- a/src/operators/multiclass_nms_op.h +++ b/src/operators/multiclass_nms_op.h @@ -36,7 +36,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::MultiClassNMSKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/pool_op.h b/src/operators/pool_op.h index 3cc1facbef4..7195c3b4e17 100644 --- a/src/operators/pool_op.h +++ b/src/operators/pool_op.h @@ -17,25 +17,25 @@ limitations under the License. */ #include #include #include +#include namespace paddle_mobile { namespace operators { -using namespace framework; - +using framework::AttributeMap; +using framework::Scope; +using std::string; template -class PoolOp : public framework::OperatorWithKernel { +class PoolOp : public OperatorWithKernel { public: - PoolOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap &attrs, - std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), + PoolOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + std::shared_ptr scope) + : OperatorWithKernel(type, inputs, outputs, attrs, scope), param_(inputs, outputs, attrs, *scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; + using OperatorWithKernel::OperatorWithKernel; void InferShape() const override; - void Run() const { - // InferShape(); + void RunImpl() const { operators::PoolKernel kernel; kernel.Compute(param_); this->ClearVariables({"X"}); diff --git a/src/operators/prior_box_op.h b/src/operators/prior_box_op.h index 17a583cac96..84481e602a6 100644 --- a/src/operators/prior_box_op.h +++ b/src/operators/prior_box_op.h @@ -36,7 +36,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::PriorBoxKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h index 26bee848c1b..6c3a614a1a0 100644 --- a/src/operators/relu_op.h +++ b/src/operators/relu_op.h @@ -35,7 +35,7 @@ class ReluOp : public framework::OperatorWithKernel { scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::ReluKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/reshape_op.h b/src/operators/reshape_op.h index 62bcb3a6798..b244e62a930 100644 --- a/src/operators/reshape_op.h +++ b/src/operators/reshape_op.h @@ -35,7 +35,7 @@ class ReshapeOp : public framework::OperatorWithKernel { scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::ReshapeKernel kernel; kernel.Compute(param_); } diff --git a/src/operators/sigmoid_op.h b/src/operators/sigmoid_op.h index ba5d3d0299f..f631ba51759 100644 --- a/src/operators/sigmoid_op.h +++ b/src/operators/sigmoid_op.h @@ -36,7 +36,7 @@ class SigmoidOp : public 
framework::OperatorWithKernel { void InferShape() const override; - void Run() const { + void RunImpl() const { operators::SigmoidKernel kernel; kernel.Compute(param_); this->ClearVariables({"X"}); diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h index 550a7698f96..07fd9b945cb 100644 --- a/src/operators/softmax_op.h +++ b/src/operators/softmax_op.h @@ -36,7 +36,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { void InferShape() const override; - void Run() const { + void RunImpl() const { operators::SoftmaxKernel kernel; kernel.Compute(param_); this->ClearVariables({"X"}); diff --git a/src/operators/transpose_op.h b/src/operators/transpose_op.h index a56771b4c64..0f673395332 100644 --- a/src/operators/transpose_op.h +++ b/src/operators/transpose_op.h @@ -36,7 +36,7 @@ class TransposeOp : public framework::OperatorWithKernel { scope), param_(inputs, outputs, attrs, *scope) {} - void Run() const { + void RunImpl() const { operators::TransposeKernel kernel; kernel.Compute(param_); } diff --git a/test/executor_for_test.h b/test/executor_for_test.h index 89b54617826..1eac6530209 100644 --- a/test/executor_for_test.h +++ b/test/executor_for_test.h @@ -17,9 +17,9 @@ limitations under the License. */ #include #include -#include "./io.h" #include "common/log.h" #include "framework/op_registry.h" +#include "io/io.h" #include "operators/conv_op.h" #include "operators/elementwise_add_op.h" #include "operators/pool_op.h" diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index 0370e6d946f..cae699b792f 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io.h" +#include "io/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp index c721c453739..6681ce83bb5 100644 --- a/test/framework/test_optimize.cpp +++ b/test/framework/test_optimize.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/program_optimize.h" -#include "io.h" +#include "io/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index ee03ed0b146..363825fe726 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "../test_helper.h" #include "../test_include.h" -#include "io.h" +#include "io/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/operators/test_pool_op.cpp b/test/operators/test_pool_op.cpp index 85c4f6106da..d92fb66efd7 100644 --- a/test/operators/test_pool_op.cpp +++ b/test/operators/test_pool_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../executor_for_test.h" #include "../test_helper.h" -#include "io.h" +#include "io/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/operators/test_reshape_op.cpp b/test/operators/test_reshape_op.cpp index 7ba2faa47df..d0cb9ac2df0 100644 --- a/test/operators/test_reshape_op.cpp +++ b/test/operators/test_reshape_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "../executor_for_test.h" #include "../test_helper.h" -#include "./io.h" +#include "io/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp index adf03761327..4ed3efaf28a 100644 --- a/test/operators/test_sigmoid_op.cpp +++ b/test/operators/test_sigmoid_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../../src/operators/kernel/sigmoid_kernel.h" #include "../test_helper.h" -#include "./io.h" +#include "io/io.h" int main() { paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_softmax_op.cpp b/test/operators/test_softmax_op.cpp index ed5a1a49f55..e0a616c9a46 100644 --- a/test/operators/test_softmax_op.cpp +++ b/test/operators/test_softmax_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../executor_for_test.h" #include "../test_helper.h" -#include "./io.h" +#include "io/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/operators/test_transpose_op.cpp b/test/operators/test_transpose_op.cpp index ffdb34f2f50..4ca05d612b7 100644 --- a/test/operators/test_transpose_op.cpp +++ b/test/operators/test_transpose_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../executor_for_test.h" #include "../test_helper.h" -#include "./io.h" +#include "io/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/test_include.h b/test/test_include.h index 25efbb9f4c0..dd4bf5d127d 100644 --- a/test/test_include.h +++ b/test/test_include.h @@ -29,4 +29,4 @@ limitations under the License. */ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" -#include "io.h" +#include "io/io.h" From 17f097807a935a992734d3d06ae3ff75803d9f52 Mon Sep 17 00:00:00 2001 From: wangliu Date: Wed, 30 May 2018 10:52:02 +0800 Subject: [PATCH 07/26] modify code style --- src/framework/operator.h | 2 +- src/{ => io}/io.cpp | 2 +- src/{ => io}/io.h | 0 src/operators/fusion_fc_op.h | 2 +- src/operators/pool_op.h | 1 + 5 files changed, 4 insertions(+), 3 deletions(-) rename src/{ => io}/io.cpp (99%) rename src/{ => io}/io.h (100%) diff --git a/src/framework/operator.h b/src/framework/operator.h index 549916b9a38..a44d264a188 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -118,7 +118,7 @@ class FusionOpMatcher : PaddleMobileObject { virtual std::string Type() = 0; - virtual void FolderNodes(const Node &node) { + virtual void FolderNodes(Node &node) { node.Folder(node_.Depth(), Type(), {}); } diff --git a/src/io.cpp b/src/io/io.cpp similarity index 99% rename from src/io.cpp rename to src/io/io.cpp index bfb3c5a7e2b..a773939505d 100644 --- a/src/io.cpp +++ b/src/io/io.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "/io.h" +#include "io/io.h" #include #include #include "common/enforce.h" diff --git a/src/io.h b/src/io/io.h similarity index 100% rename from src/io.h rename to src/io/io.h diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index 6e0c50170a1..0ed5a2b4d5e 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -32,7 +32,7 @@ class FusionFcMatcher : public framework::FusionOpMatcher { node_ > std::make_shared("elementwise_add"); } - void FolderNodes(const framework::Node &node) { + void FolderNodes(framework::Node &node) { vector> origin_descs = node.OpDescs(node_.Depth()); node.Folder(node_.Depth(), Type(), {{"elementwise_add", {"Y", "Z"}}}); diff --git a/src/operators/pool_op.h b/src/operators/pool_op.h index 7195c3b4e17..ff44771c561 100644 --- a/src/operators/pool_op.h +++ b/src/operators/pool_op.h @@ -22,6 +22,7 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { using framework::AttributeMap; +using framework::OperatorWithKernel; using framework::Scope; using std::string; template From 9f48fe1343ed651f71dc6f0ebcbfe790099ac018 Mon Sep 17 00:00:00 2001 From: wangliu Date: Wed, 30 May 2018 11:47:10 +0800 Subject: [PATCH 08/26] add timer for debug --- src/io/io.cpp | 421 --------------------------- src/io/io.h | 71 ----- test/executor_for_test.h | 2 +- test/framework/test_load.cpp | 2 +- test/framework/test_optimize.cpp | 2 +- test/net/test_googlenet.cpp | 12 +- test/operators/test_pool_op.cpp | 2 +- test/operators/test_reshape_op.cpp | 2 +- test/operators/test_sigmoid_op.cpp | 2 +- test/operators/test_softmax_op.cpp | 2 +- test/operators/test_transpose_op.cpp | 2 +- test/test_helper.h | 13 + test/test_include.h | 2 +- 13 files changed, 28 insertions(+), 507 deletions(-) delete mode 100644 src/io/io.cpp delete mode 100644 src/io/io.h diff --git a/src/io/io.cpp b/src/io/io.cpp deleted file mode 100644 index a773939505d..00000000000 --- a/src/io/io.cpp +++ /dev/null @@ -1,421 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "io/io.h" -#include -#include -#include "common/enforce.h" -#include "common/log.h" -#include "framework/framework.pb-c.h" -#include "framework/lod_tensor.h" -#include "framework/operator.h" -#include "framework/program/program_desc.h" -#include "framework/program/var_desc.h" -#include "framework/scope.h" -#include "framework/tensor.h" - -namespace paddle_mobile { -using framework::Variable; - -void ReadBinaryFile(const std::string &filename, std::string *contents) { - std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_MOBILE_ENFORCE(fin.is_open(), "open file: %s failed", - filename.c_str()); - fin.seekg(0, std::ios::end); - contents->clear(); - contents->resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(contents->at(0)), contents->size()); - fin.close(); -} - -static size_t ReadBuffer(const char *file_name, uint8_t **out) { - printf("%s \n", file_name); - FILE *fp; - fp = fopen(file_name, "rb"); - PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name); - - fseek(fp, 0, SEEK_END); - size_t size = ftell(fp); - rewind(fp); - - DLOG << "model size: " << size; - - *out = reinterpret_cast(size); - - size_t cur_len = 0; - size_t nread; - while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { - cur_len += nread; - } - fclose(fp); - return cur_len; -} - -template -void Loader::LoadVar(framework::Variable *variable, - const framework::VarDesc &var_desc, - const std::string &file_path) { - auto tensor = variable->GetMutable(); - std::ifstream is(file_path); - PADDLE_MOBILE_ENFORCE(is.is_open(), "open file: %s failed", - file_path.c_str()); - - std::fpos pos; - pos = is.tellg(); // save current position - is.seekg(0, std::ios::end); - is.seekg(pos); // restore saved position - - // 1. version - uint32_t version; - is.read(reinterpret_cast(&version), sizeof(version)); - - // 2 Lod information - uint64_t lod_level; - is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); - auto &lod = *tensor->mutable_lod(); - lod.resize(lod_level); - for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size; - is.read(reinterpret_cast(&size), sizeof(size)); - std::vector tmp(size / sizeof(size_t)); - is.read(reinterpret_cast(tmp.data()), - static_cast(size)); - for (auto j : tmp) { - LOG(kLOG_DEBUG1) << " lod - " << j; - } - lod[i] = tmp; - } - - // 3. tensor version - uint32_t tensor_version; - is.read(reinterpret_cast(&tensor_version), sizeof(tensor_version)); - - // 4. 
tensor desc - int32_t size; - is.read(reinterpret_cast(&size), sizeof(size)); - std::unique_ptr buf(new char[size]); - is.read(reinterpret_cast(buf.get()), size); - - const framework::TensorDesc &desc = var_desc.Tensor_desc(); - - PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor_desc = NULL; - // void *v; - // PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure()(tensor_desc, - // buf.get()); - - // DLOG << "PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure- " << - // tensor_desc; - - // framework::TensorDesc &tensor_desc = variable-> - // PaddleMobile__Framework__Proto__ProgramDesc *c_program; - // uint8_t *proto_buf = NULL; - // size_t read_size = ReadBuffer(file_path.c_str(), &proto_buf); - // c_program = paddle_mobile__framework__proto__program_desc__unpack(NULL, - // read_size, buf); - - // paddle_mobile__framework__proto__var_type__tensor_desc__init() - - int memory_size = 1; - for (auto l : desc.Dims()) { - memory_size *= l; - } - - tensor->Resize(framework::make_ddim(desc.Dims())); - - void *memory = tensor; - int type_size = 0; - switch (desc.DataType()) { - case framework::VARTYPE_TYPE_FP16: - type_size = 2; - break; - case framework::VARTYPE_TYPE_FP32: - type_size = 4; - memory = tensor->mutable_data(); - break; - case framework::VARTYPE_TYPE_FP64: - type_size = 8; - break; - case framework::VARTYPE_TYPE_INT32: - type_size = 4; - break; - case framework::VARTYPE_TYPE_INT64: - type_size = 8; - break; - case framework::VARTYPE_TYPE_BOOL: - type_size = 1; - break; - default: - break; - } - - is.read(static_cast(memory), memory_size * type_size); - is.close(); -} - -template -const framework::Program Loader::Load( - const std::string &dirname) { - std::string model_filename = dirname + "/__model__"; - PaddleMobile__Framework__Proto__ProgramDesc *c_program; - uint8_t *buf = NULL; - size_t read_size = ReadBuffer(model_filename.c_str(), &buf); - - PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null"); - - c_program = paddle_mobile__framework__proto__program_desc__unpack( - NULL, read_size, buf); - - PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null"); - - DLOG << "n_ops: " << (*c_program->blocks)->n_ops; - - std::shared_ptr originProgramDesc = - std::make_shared(c_program); - - framework::Program program; - program.model_path = dirname; - program.originProgram = originProgramDesc; - - std::shared_ptr scope = - std::make_shared(); - program.scope = scope; - originProgramDesc->Block(0); - - for (const auto &block : originProgramDesc->Blocks()) { - for (int i = 0; i < block->Vars().size(); ++i) { - std::shared_ptr var_desc = block->Vars()[i]; - // DLOG << "var name-- " << var_desc->Name(); - auto var = scope->Var(var_desc->Name()); - - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { - if (var_desc->Persistable() && - var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH && - var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) { - // DLOG << "to load var "; - auto dim = var_desc->Tensor_desc().Dims(); - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); - } else { - auto dim = var_desc->Tensor_desc().Dims(); - PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); - dim[0] = 1; - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); - } - } else { - // TODO(codeWorm): some. 
- } - } - } - - originProgramDesc->Description("program: "); - - paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); - return program; -} - -template class Loader; - -#pragma mark - executor - -template -Executor::Executor(const framework::Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - - const std::vector> blocks = - to_predict_program_->Blocks(); - for (int i = 0; i < blocks.size(); ++i) { - std::shared_ptr block_desc = blocks[i]; - std::vector> ops = block_desc->Ops(); - for (int j = 0; j < ops.size(); ++j) { - std::shared_ptr op = ops[j]; - auto op_base = framework::OpRegistry::CreateOp( - op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), - program_.scope); - op_base->InferShape(); - ops_of_block_[*block_desc.get()].push_back(op_base); - } - } - InitMemory(); -} - -template -Executor::Executor(const framework::Program p, int batch_size) - : program_(p), batch_size_(batch_size) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - Variable *variable_ptr = program_.scope->Var("batch_size"); - variable_ptr[0].SetValue(batch_size); - const std::vector> blocks = - to_predict_program_->Blocks(); - for (int i = 0; i < blocks.size(); ++i) { - std::shared_ptr block_desc = blocks[i]; - std::vector> ops = block_desc->Ops(); - for (int j = 0; j < ops.size(); ++j) { - std::shared_ptr op = ops[j]; - auto op_base = framework::OpRegistry::CreateOp( - op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), - program_.scope); - op_base->InferShape(); - - ops_of_block_[*block_desc.get()].push_back(op_base); - } - } - InitMemory(); -} - -template -void Executor::LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, - const std::string &file_path) { - std::ifstream is(file_path); - PADDLE_MOBILE_ENFORCE(is.is_open(), "open file: %s failed", - file_path.c_str()); - std::fpos pos; - pos = is.tellg(); // save current position - is.seekg(0, std::ios::end); - is.seekg(pos); // restore saved position - - // 1. version - uint32_t version; - is.read(reinterpret_cast(&version), sizeof(version)); - - // 2 Lod information - uint64_t lod_level; - is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); - auto &lod = *tensor->mutable_lod(); - lod.resize(lod_level); - for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size; - is.read(reinterpret_cast(&size), sizeof(size)); - std::vector tmp(size / sizeof(size_t)); - is.read(reinterpret_cast(tmp.data()), - static_cast(size)); - for (auto j : tmp) { - LOG(kLOG_DEBUG1) << " lod - " << j; - } - lod[i] = tmp; - } - - // 3. tensor version - uint32_t tensor_version; - is.read(reinterpret_cast(&tensor_version), sizeof(tensor_version)); - - // 4. 
tensor desc - int32_t size; - is.read(reinterpret_cast(&size), sizeof(size)); - std::unique_ptr buf(new char[size]); - is.read(reinterpret_cast(buf.get()), size); - - const framework::TensorDesc &desc = var_desc.Tensor_desc(); - - int memory_size = 1; - for (auto l : desc.Dims()) { - memory_size *= l; - } - - tensor->Resize(framework::make_ddim(desc.Dims())); - - void *memory = tensor; - int type_size = 0; - switch (desc.DataType()) { - case framework::VARTYPE_TYPE_FP16: - type_size = 2; - break; - case framework::VARTYPE_TYPE_FP32: - type_size = 4; - memory = tensor->mutable_data(); - break; - case framework::VARTYPE_TYPE_FP64: - type_size = 8; - break; - case framework::VARTYPE_TYPE_INT32: - type_size = 4; - break; - case framework::VARTYPE_TYPE_INT64: - type_size = 8; - break; - case framework::VARTYPE_TYPE_BOOL: - type_size = 1; - break; - default: - break; - } - - is.read(static_cast(memory), memory_size * type_size); - is.close(); -} - -template -void Executor::InitMemory() { - for (const auto &block : to_predict_program_->Blocks()) { - for (const auto &var_desc : block->Vars()) { - auto var = program_.scope->Var(var_desc->Name()); - if (var_desc->Persistable()) { - auto tensor = var->template GetMutable(); - if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { - continue; - } - LoadMemory(*var_desc, tensor, - program_.model_path + "/" + var_desc->Name()); - } else { - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { - auto tensor = var->template GetMutable(); - - tensor->template mutable_data(); - } - } - } - } -} - -template -void Executor::predict(const framework::Tensor &t, int block_id) { - framework::Variable *g_feed_value = program_.scope->Var("feed"); - framework::Tensor *feed_tensor = - g_feed_value->GetMutable(); - feed_tensor->Resize(t.dims()); - feed_tensor->ShareDataWith(t); - std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - op->Run(); - } -} - -template -std::vector::Ptype> Executor::predict( - const std::vector &input, const std::vector &dims) { - framework::Tensor tensor(input, framework::make_ddim(dims)); - - predict(tensor, 0); - - framework::Variable *g_feed_value = program_.scope->Var("col"); - auto feed_tensor = g_feed_value->GetMutable(); - - return {}; -} - -template class Executor; - -} // namespace paddle_mobile diff --git a/src/io/io.h b/src/io/io.h deleted file mode 100644 index 678441a9e05..00000000000 --- a/src/io/io.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
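Worth noting while this file is moved: the removed ReadBuffer casts the size_t itself to a pointer (*out = reinterpret_cast(size)), so no buffer is ever allocated; the re-created file below restores the malloc call. A self-contained version of the intended read-whole-file loop, where ReadWholeFile is an illustrative name and error handling is simplified:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

static size_t ReadWholeFile(const char *file_name, uint8_t **out) {
  FILE *fp = std::fopen(file_name, "rb");
  if (fp == nullptr) return 0;  // caller must check for failure
  std::fseek(fp, 0, SEEK_END);
  size_t size = static_cast<size_t>(std::ftell(fp));
  std::rewind(fp);
  // Cast the pointer returned by malloc, never the byte count itself.
  *out = reinterpret_cast<uint8_t *>(std::malloc(size));
  size_t cur_len = 0, nread = 0;
  while ((nread = std::fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
    cur_len += nread;  // fread may return short counts; keep looping
  }
  std::fclose(fp);
  return cur_len;
}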
*/ - -#pragma once - -#include -#include -#include - -#include "common/types.h" -#include "framework/lod_tensor.h" -#include "framework/operator.h" -#include "framework/paddle_mobile_object.h" -#include "framework/program/program.h" -#include "framework/tensor.h" - -namespace paddle_mobile { - -template -class Loader : PaddleMobileObject { - public: - const framework::Program Load(const std::string &dirname); - - private: - void LoadVar(framework::Variable *variable, - const framework::VarDesc &var_desc, - const std::string &file_path); -}; - -template -class Executor { - public: - typedef typename PrecisionTrait

::ptype Ptype; - - Executor() = default; - - Executor(const framework::Program p); - - Executor(const framework::Program p, int batch_size); - - std::shared_ptr predict(framework::Tensor &t); - - std::vector predict(const std::vector &input, - const std::vector &dims); - - protected: - void InitMemory(); - void LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, const std::string &file_path); - framework::Program program_; - int batch_size_ = 1; - std::shared_ptr to_predict_program_; - void predict(const framework::Tensor &t, int block_id); - std::map>>> - ops_of_block_; - bool use_optimize_ = false; -}; - -} // namespace paddle_mobile diff --git a/test/executor_for_test.h b/test/executor_for_test.h index 1eac6530209..c69eba222fb 100644 --- a/test/executor_for_test.h +++ b/test/executor_for_test.h @@ -17,9 +17,9 @@ limitations under the License. */ #include #include +#include "common/io.h" #include "common/log.h" #include "framework/op_registry.h" -#include "io/io.h" #include "operators/conv_op.h" #include "operators/elementwise_add_op.h" #include "operators/pool_op.h" diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index cae699b792f..fe403b55a18 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io/io.h" +#include "common/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp index 6681ce83bb5..4c4dc6eb3ee 100644 --- a/test/framework/test_optimize.cpp +++ b/test/framework/test_optimize.cpp @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "common/io.h" #include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/program_optimize.h" -#include "io/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index 363825fe726..d52f080277a 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -13,25 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. */ #include - #include "../test_helper.h" #include "../test_include.h" -#include "io/io.h" int main() { paddle_mobile::Loader loader; // ../../../test/models/googlenet // ../../../test/models/mobilenet + auto time1 = time(); auto program = loader.Load(std::string("../models/googlenet")); - + auto time2 = time(); + DLOG << "load cost :" << time_diff(time1, time2) << "ms"; paddle_mobile::Executor executor(program, 1); std::vector input; std::vector dims{1, 3, 224, 224}; GetInput(g_test_image_1x3x224x224, &input, dims); - - // DLOG << " input: " << input; + auto time3 = time(); executor.predict(input, dims); - + auto time4 = time(); + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; return 0; } diff --git a/test/operators/test_pool_op.cpp b/test/operators/test_pool_op.cpp index d92fb66efd7..8a1c0a7ccec 100644 --- a/test/operators/test_pool_op.cpp +++ b/test/operators/test_pool_op.cpp @@ -14,7 +14,7 @@ limitations under the License.
*/ #include "../executor_for_test.h" #include "../test_helper.h" -#include "io/io.h" +#include "common/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/operators/test_reshape_op.cpp b/test/operators/test_reshape_op.cpp index d0cb9ac2df0..b0251e693a7 100644 --- a/test/operators/test_reshape_op.cpp +++ b/test/operators/test_reshape_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../executor_for_test.h" #include "../test_helper.h" -#include "io/io.h" +#include "common/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp index 4ed3efaf28a..dcd35cd8e46 100644 --- a/test/operators/test_sigmoid_op.cpp +++ b/test/operators/test_sigmoid_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../../src/operators/kernel/sigmoid_kernel.h" #include "../test_helper.h" -#include "io/io.h" +#include "common/io.h" int main() { paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_softmax_op.cpp b/test/operators/test_softmax_op.cpp index e0a616c9a46..094c48adbb6 100644 --- a/test/operators/test_softmax_op.cpp +++ b/test/operators/test_softmax_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../executor_for_test.h" #include "../test_helper.h" -#include "io/io.h" +#include "common/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/operators/test_transpose_op.cpp b/test/operators/test_transpose_op.cpp index 4ca05d612b7..23e3bc3ec47 100644 --- a/test/operators/test_transpose_op.cpp +++ b/test/operators/test_transpose_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../executor_for_test.h" #include "../test_helper.h" -#include "io/io.h" +#include "common/io.h" int main() { paddle_mobile::Loader loader; diff --git a/test/test_helper.h b/test/test_helper.h index 029ed9742f6..dba4dec9bbc 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include @@ -31,6 +32,18 @@ static const std::string g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; using paddle_mobile::framework::DDim; using paddle_mobile::framework::Tensor; + +using Time = decltype(std::chrono::high_resolution_clock::now()); + +Time time() { return std::chrono::high_resolution_clock::now(); } + +double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; +} + template void SetupTensor(paddle_mobile::framework::Tensor *input, paddle_mobile::framework::DDim dims, T lower, T upper) { diff --git a/test/test_include.h b/test/test_include.h index dd4bf5d127d..19a9bff8846 100644 --- a/test/test_include.h +++ b/test/test_include.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "./test_helper.h" #include "common/enforce.h" +#include "common/io.h" #include "common/log.h" #include "framework/lod_tensor.h" #include "framework/operator.h" @@ -29,4 +30,3 @@ limitations under the License. 
*/ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" -#include "io/io.h" From abc98f5c6c29ebbeac695f282358aff7cb86cab3 Mon Sep 17 00:00:00 2001 From: wangliu Date: Wed, 30 May 2018 13:02:53 +0800 Subject: [PATCH 09/26] commit io files --- src/common/io.cpp | 421 ++++++++++++++++++++++++++++++++++++++++++++++ src/common/io.h | 71 ++++++++ 2 files changed, 492 insertions(+) create mode 100644 src/common/io.cpp create mode 100644 src/common/io.h diff --git a/src/common/io.cpp b/src/common/io.cpp new file mode 100644 index 00000000000..4f9309aa108 --- /dev/null +++ b/src/common/io.cpp @@ -0,0 +1,421 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "io.h" +#include +#include +#include "common/enforce.h" +#include "common/log.h" +#include "framework/framework.pb-c.h" +#include "framework/lod_tensor.h" +#include "framework/operator.h" +#include "framework/program/program_desc.h" +#include "framework/program/var_desc.h" +#include "framework/scope.h" +#include "framework/tensor.h" + +namespace paddle_mobile { +using framework::Variable; + +void ReadBinaryFile(const std::string &filename, std::string *contents) { + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_MOBILE_ENFORCE(fin.is_open(), "open file: %s failed", + filename.c_str()); + fin.seekg(0, std::ios::end); + contents->clear(); + contents->resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&(contents->at(0)), contents->size()); + fin.close(); +} + +static size_t ReadBuffer(const char *file_name, uint8_t **out) { + printf("%s \n", file_name); + FILE *fp; + fp = fopen(file_name, "rb"); + PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name); + + fseek(fp, 0, SEEK_END); + size_t size = ftell(fp); + rewind(fp); + + DLOG << "model size: " << size; + + *out = reinterpret_cast(malloc(size)); + + size_t cur_len = 0; + size_t nread; + while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { + cur_len += nread; + } + fclose(fp); + return cur_len; +} + +template +void Loader::LoadVar(framework::Variable *variable, + const framework::VarDesc &var_desc, + const std::string &file_path) { + auto tensor = variable->GetMutable(); + std::ifstream is(file_path); + PADDLE_MOBILE_ENFORCE(is.is_open(), "open file: %s failed", + file_path.c_str()); + + std::fpos pos; + pos = is.tellg(); // save current position + is.seekg(0, std::ios::end); + is.seekg(pos); // restore saved position + + // 1. 
version + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + + // 2 Lod information + uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + for (auto j : tmp) { + LOG(kLOG_DEBUG1) << " lod - " << j; + } + lod[i] = tmp; + } + + // 3. tensor version + uint32_t tensor_version; + is.read(reinterpret_cast(&tensor_version), sizeof(tensor_version)); + + // 4. tensor desc + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + + const framework::TensorDesc &desc = var_desc.Tensor_desc(); + + PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor_desc = NULL; + // void *v; + // PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure()(tensor_desc, + // buf.get()); + + // DLOG << "PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure- " << + // tensor_desc; + + // framework::TensorDesc &tensor_desc = variable-> + // PaddleMobile__Framework__Proto__ProgramDesc *c_program; + // uint8_t *proto_buf = NULL; + // size_t read_size = ReadBuffer(file_path.c_str(), &proto_buf); + // c_program = paddle_mobile__framework__proto__program_desc__unpack(NULL, + // read_size, buf); + + // paddle_mobile__framework__proto__var_type__tensor_desc__init() + + int memory_size = 1; + for (auto l : desc.Dims()) { + memory_size *= l; + } + + tensor->Resize(framework::make_ddim(desc.Dims())); + + void *memory = tensor; + int type_size = 0; + switch (desc.DataType()) { + case framework::VARTYPE_TYPE_FP16: + type_size = 2; + break; + case framework::VARTYPE_TYPE_FP32: + type_size = 4; + memory = tensor->mutable_data(); + break; + case framework::VARTYPE_TYPE_FP64: + type_size = 8; + break; + case framework::VARTYPE_TYPE_INT32: + type_size = 4; + break; + case framework::VARTYPE_TYPE_INT64: + type_size = 8; + break; + case framework::VARTYPE_TYPE_BOOL: + type_size = 1; + break; + default: + break; + } + + is.read(static_cast(memory), memory_size * type_size); + is.close(); +} + +template +const framework::Program Loader::Load( + const std::string &dirname) { + std::string model_filename = dirname + "/__model__"; + PaddleMobile__Framework__Proto__ProgramDesc *c_program; + uint8_t *buf = NULL; + size_t read_size = ReadBuffer(model_filename.c_str(), &buf); + + PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null"); + + c_program = paddle_mobile__framework__proto__program_desc__unpack( + NULL, read_size, buf); +// + PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null"); +// + DLOG << "n_ops: " << (*c_program->blocks)->n_ops; +// + std::shared_ptr originProgramDesc = + std::make_shared(c_program); + + framework::Program program; + program.model_path = dirname; + program.originProgram = originProgramDesc; + + std::shared_ptr scope = + std::make_shared(); + program.scope = scope; + originProgramDesc->Block(0); + + for (const auto &block : originProgramDesc->Blocks()) { + for (int i = 0; i < block->Vars().size(); ++i) { + std::shared_ptr var_desc = block->Vars()[i]; + // DLOG << "var name-- " << var_desc->Name(); + auto var = scope->Var(var_desc->Name()); + + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + if (var_desc->Persistable() && + var_desc->Type() != 
framework::VARTYPE_TYPE_FEED_MINIBATCH && + var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) { + // DLOG << "to load var "; + auto dim = var_desc->Tensor_desc().Dims(); + auto tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(dim)); + } else { + auto dim = var_desc->Tensor_desc().Dims(); + PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); + dim[0] = 1; + auto tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(dim)); + } + } else { + // TODO(codeWorm): some. + } + } + } + + originProgramDesc->Description("program: "); + + paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); + return program; +} + +template class Loader; + +#pragma mark - executor + +template +Executor::Executor(const framework::Program p) : program_(p) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + + const std::vector> blocks = + to_predict_program_->Blocks(); + for (int i = 0; i < blocks.size(); ++i) { + std::shared_ptr block_desc = blocks[i]; + std::vector> ops = block_desc->Ops(); + for (int j = 0; j < ops.size(); ++j) { + std::shared_ptr op = ops[j]; + auto op_base = framework::OpRegistry::CreateOp( + op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), + program_.scope); + op_base->InferShape(); + ops_of_block_[*block_desc.get()].push_back(op_base); + } + } + InitMemory(); +} + +template +Executor::Executor(const framework::Program p, int batch_size) + : program_(p), batch_size_(batch_size) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + Variable *variable_ptr = program_.scope->Var("batch_size"); + variable_ptr[0].SetValue(batch_size); + const std::vector> blocks = + to_predict_program_->Blocks(); + for (int i = 0; i < blocks.size(); ++i) { + std::shared_ptr block_desc = blocks[i]; + std::vector> ops = block_desc->Ops(); + for (int j = 0; j < ops.size(); ++j) { + std::shared_ptr op = ops[j]; + auto op_base = framework::OpRegistry::CreateOp( + op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), + program_.scope); + op_base->InferShape(); + + ops_of_block_[*block_desc.get()].push_back(op_base); + } + } + InitMemory(); +} + +template +void Executor::LoadMemory(const framework::VarDesc var_desc, + framework::LoDTensor *tensor, + const std::string &file_path) { + std::ifstream is(file_path); + PADDLE_MOBILE_ENFORCE(is.is_open(), "open file: %s failed", + file_path.c_str()); + std::fpos pos; + pos = is.tellg(); // save current position + is.seekg(0, std::ios::end); + is.seekg(pos); // restore saved position + + // 1. version + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + + // 2 Lod information + uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + for (auto j : tmp) { + LOG(kLOG_DEBUG1) << " lod - " << j; + } + lod[i] = tmp; + } + + // 3. tensor version + uint32_t tensor_version; + is.read(reinterpret_cast(&tensor_version), sizeof(tensor_version)); + + // 4. 
tensor desc + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + + const framework::TensorDesc &desc = var_desc.Tensor_desc(); + + int memory_size = 1; + for (auto l : desc.Dims()) { + memory_size *= l; + } + + tensor->Resize(framework::make_ddim(desc.Dims())); + + void *memory = tensor; + int type_size = 0; + switch (desc.DataType()) { + case framework::VARTYPE_TYPE_FP16: + type_size = 2; + break; + case framework::VARTYPE_TYPE_FP32: + type_size = 4; + memory = tensor->mutable_data(); + break; + case framework::VARTYPE_TYPE_FP64: + type_size = 8; + break; + case framework::VARTYPE_TYPE_INT32: + type_size = 4; + break; + case framework::VARTYPE_TYPE_INT64: + type_size = 8; + break; + case framework::VARTYPE_TYPE_BOOL: + type_size = 1; + break; + default: + break; + } + + is.read(static_cast(memory), memory_size * type_size); + is.close(); +} + +template +void Executor::InitMemory() { + for (const auto &block : to_predict_program_->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = program_.scope->Var(var_desc->Name()); + if (var_desc->Persistable()) { + auto tensor = var->template GetMutable(); + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { + continue; + } + LoadMemory(*var_desc, tensor, + program_.model_path + "/" + var_desc->Name()); + } else { + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + auto tensor = var->template GetMutable(); + + tensor->template mutable_data(); + } + } + } + } +} + +template +void Executor::predict(const framework::Tensor &t, int block_id) { + framework::Variable *g_feed_value = program_.scope->Var("feed"); + framework::Tensor *feed_tensor = + g_feed_value->GetMutable(); + feed_tensor->Resize(t.dims()); + feed_tensor->ShareDataWith(t); + std::shared_ptr to_predict_block = + to_predict_program_->Block(block_id); + for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { + auto op = ops_of_block_[*to_predict_block.get()][j]; + op->Run(); + } +} + +template +std::vector::Ptype> Executor::predict( + const std::vector &input, const std::vector &dims) { + framework::Tensor tensor(input, framework::make_ddim(dims)); + + predict(tensor, 0); + + framework::Variable *g_feed_value = program_.scope->Var("col"); + auto feed_tensor = g_feed_value->GetMutable(); + + return {}; +} + +template class Executor; + +} // namespace paddle_mobile diff --git a/src/common/io.h b/src/common/io.h new file mode 100644 index 00000000000..678441a9e05 --- /dev/null +++ b/src/common/io.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "common/types.h" +#include "framework/lod_tensor.h" +#include "framework/operator.h" +#include "framework/paddle_mobile_object.h" +#include "framework/program/program.h" +#include "framework/tensor.h" + +namespace paddle_mobile { + +template +class Loader : PaddleMobileObject { + public: + const framework::Program Load(const std::string &dirname); + + private: + void LoadVar(framework::Variable *variable, + const framework::VarDesc &var_desc, + const std::string &file_path); +}; + +template +class Executor { + public: + typedef typename PrecisionTrait
::ptype Ptype;
+
+  Executor() = default;
+
+  Executor(const framework::Program p);
+
+  Executor(const framework::Program p, int batch_size);
+
+  std::shared_ptr predict(framework::Tensor &t);
+
+  std::vector predict(const std::vector &input,
+                      const std::vector &dims);
+
+ protected:
+  void InitMemory();
+  void LoadMemory(const framework::VarDesc var_desc,
+                  framework::LoDTensor *tensor, const std::string &file_path);
+  framework::Program program_;
+  int batch_size_ = 1;
+  std::shared_ptr to_predict_program_;
+  void predict(const framework::Tensor &t, int block_id);
+  std::map>>>
+      ops_of_block_;
+  bool use_optimize_ = false;
+};
+
+}  // namespace paddle_mobile

From df6475ee7ac1f8e858b2f0c9bc896635c22ab7af Mon Sep 17 00:00:00 2001
From: wangliu
Date: Wed, 30 May 2018 13:08:39 +0800
Subject: [PATCH 10/26] commit io files

---
 src/common/io.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/common/io.cpp b/src/common/io.cpp
index 4f9309aa108..fc1466237e9 100644
--- a/src/common/io.cpp
+++ b/src/common/io.cpp
@@ -176,11 +176,11 @@ const framework::Program Loader::Load(
   c_program = paddle_mobile__framework__proto__program_desc__unpack(
       NULL, read_size, buf);
-//
+  //
   PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null");
-//
+  //
   DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
-//
+  //
   std::shared_ptr originProgramDesc =
       std::make_shared(c_program);

From 5f3f414f84ba2be8d1fb1fbf70ce975532f0f858 Mon Sep 17 00:00:00 2001
From: wangliu
Date: Wed, 30 May 2018 14:29:07 +0800
Subject: [PATCH 11/26] Init tensor memory in executor_for_test

---
 src/framework/operator.cpp | 1 +
 test/executor_for_test.h   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp
index dfdf0af79ac..808002d4c8f 100644
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -23,6 +23,7 @@ vector OperatorBase::GetOutKeys() const {
   auto it = op_input_output_key.find(type_);
   if (it == op_input_output_key.end()) {
     DLOG << type_ << " has no outputs";
+    return {};
   }
   return it->second.second;
 }
diff --git a/test/executor_for_test.h b/test/executor_for_test.h
index c69eba222fb..48b6b5cf3c6 100644
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -73,6 +73,7 @@ class Executor4Test : public Executor {
       }
     }
   }
+  this->InitMemory();
 }

From 29c19e4411befa52dd4c303a21c49aa52749a372 Mon Sep 17 00:00:00 2001
From: liuruilong
Date: Wed, 30 May 2018 15:30:45 +0800
Subject: [PATCH 12/26] add split, format codes

---
 src/common/types.h                          | 41 +++++++++++
 src/framework/operator.h                    | 69 +++++++++++--------
 .../program/program-optimize/node.cpp       | 66 ++++++++++++++++--
 src/framework/program/program-optimize/node.h | 4 +-
 .../program-optimize/program_optimize.cpp   |  4 +-
 .../program-optimize/program_optimize.h     |  1 -
 src/{common => }/io.cpp                     | 60 ++++++----------
 src/{common => }/io.h                       | 12 ++--
 src/operators/fusion_conv_add_relu_op.h     | 11 ++-
 src/operators/fusion_fc_op.h                |  8 +--
 src/operators/kernel/arm/relu_kernel.cpp    |  4 ++
 src/operators/op_param.h                    |  4 +-
 src/operators/relu_op.cpp                   |  4 ++
 src/operators/relu_op.h                     | 10 +++
 test/executor_for_test.h                    |  6 +-
 test/framework/test_load.cpp                |  7 +-
 test/framework/test_optimize.cpp            |  7 +-
 test/net/test_googlenet.cpp                 |  6 +-
 test/operators/test_batchnorm_op.cpp        |  2 +-
 test/operators/test_box_coder_op.cpp        |  2 +-
 test/operators/test_concat_op.cpp           |  2 +-
 test/operators/test_cov_op.cpp              |  2 +-
 test/operators/test_elementwise_add_op.cpp  |  2 +-
 test/operators/test_fushion_fc_op.cpp       |  2 +-
test/operators/test_lrn_op.cpp | 2 +- test/operators/test_mul_op.cpp | 2 +- test/operators/test_pool_op.cpp | 6 +- test/operators/test_prior_box_op.cpp | 2 +- test/operators/test_relu_op.cpp | 2 +- test/operators/test_reshape_op.cpp | 6 +- test/operators/test_sigmoid_op.cpp | 2 +- test/operators/test_softmax_op.cpp | 6 +- test/operators/test_transpose_op.cpp | 6 +- test/test_helper.h | 2 +- test/test_include.h | 2 +- 35 files changed, 242 insertions(+), 132 deletions(-) rename src/{common => }/io.cpp (91%) rename src/{common => }/io.h (86%) diff --git a/src/common/types.h b/src/common/types.h index ae76c953aa5..252c747d75f 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -14,6 +14,10 @@ limitations under the License. */ #pragma once; +#include +#include +#include + namespace paddle_mobile { enum class Precision : int { FP32 = 0 }; @@ -67,4 +71,41 @@ enum PMStatus { PMUnImplError = 0x07, /*!< Unimplement error. */ PMWrongDevice = 0x08 /*!< un-correct device. */ }; + +static const std::string G_OP_TYPE_CONV = "conv2d"; +static const std::string G_OP_TYPE_BATCHNORM = "batch_norm"; +static const std::string G_OP_TYPE_BOX_CODER = "box_coder"; +static const std::string G_OP_TYPE_CONCAT = "concat"; +static const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; +static const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "FusionConvAddRelu"; +static const std::string G_OP_TYPE_FC = "fc"; +static const std::string G_OP_TYPE_LRN = "lrn"; +static const std::string G_OP_TYPE_MUL = "mul"; +static const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; +static const std::string G_OP_TYPE_POOL2D = "pool2d"; +static const std::string G_OP_TYPE_PRIOR_BOX = "prior_box"; +static const std::string G_OP_TYPE_RELU = "relu"; +static const std::string G_OP_TYPE_RESHAPE = "reshape"; +static const std::string G_OP_TYPE_SIGMOID = "sigmoid"; +static const std::string G_OP_TYPE_SOFTMAX = "softmax"; +static const std::string G_OP_TYPE_TRANSPOSE = "transpose"; +static const std::string G_OP_TYPE_SPLIT = "split"; +static const std::string G_OP_TYPE_FEED = "feed"; +static const std::string G_OP_TYPE_FETCH = "fetch"; + +static std::unordered_map< + std::string, std::pair, std::vector>> + op_input_output_key = {{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}}, + {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, + {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, + {G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, + {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}}, + {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}}, + {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}}, + {G_OP_TYPE_LRN, {{"X"}, {"Out"}}}, + {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}}, + {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FEED, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}}; + } // namespace paddle_mobile diff --git a/src/framework/operator.h b/src/framework/operator.h index a44d264a188..e9dc6f6fb75 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -19,61 +19,64 @@ limitations under the License. 
*/

 #include
 #include

-#include "common/enforce.h"
-#include "common/type_define.h"
 #include "common/types.h"
+#include "common/enforce.h"
 #include "common/variant.h"
-#include "framework/attribute.h"
+#include "framework/scope.h"
+#include "framework/tensor.h"
 #include "framework/op_info.h"
-#include "framework/op_kernel_type.h"
+#include "common/type_define.h"
+#include "framework/variable.h"
+#include "framework/attribute.h"
 #include "framework/op_registry.h"
-#include "framework/paddle_mobile_object.h"
+#include "framework/op_kernel_type.h"
 #include "framework/program/block_desc.h"
+#include "framework/paddle_mobile_object.h"
 #include "framework/program/program-optimize/node.h"
-#include "framework/scope.h"
-#include "framework/tensor.h"
-#include "framework/variable.h"

 namespace paddle_mobile {
 namespace framework {
 using std::string;
 using std::vector;

-static std::unordered_map<
-    std::string, std::pair, std::vector>>
-    op_input_output_key = {{"conv2d", {{"Input"}, {"Output"}}},
-                           {"relu", {{"X"}, {"Out"}}},
-                           {"softmax", {{"X"}, {"Out"}}},
-                           {"mul", {{"X"}, {"Out"}}},
-                           {"elementwise_add", {{"X", "Y"}, {"Out"}}},
-                           {"pool2d", {{"X"}, {"Out"}}},
-                           {"batch_norm", {{"X"}, {"Y"}}},
-                           {"lrn", {{"X"}, {"Out"}}},
-                           {"concat", {{"X"}, {"Out"}}},
-                           {"feed", {{"X"}, {"Out"}}},
-                           {"fetch", {{"X"}, {"Out"}}}};
-
 template
 class OperatorBase : PaddleMobileObject {
  public:
+  /*
+   * @b Constructor of the op base class; the op receives its inputs,
+   *    attributes, and the pre-allocated output tensors
+   * */
   OperatorBase(const std::string &type, const VariableNameMap &inputs,
                const VariableNameMap &outputs, const AttributeMap &attrs,
                std::shared_ptr scope);
   virtual ~OperatorBase() {}
   void Run() const;
-  vector GetOutKeys() const;
+  std::vector GetOutKeys() const;
   virtual void RunImpl() const = 0;
-  virtual void InferShape() const = 0;
+  /*
+   * @b Inputs the op computes on, e.g. the previous layer's output or a
+   *    convolution filter
+   * */
   const VariableNameMap &Inputs() const { return inputs_; }
+  /*
+   * @b Outputs of the op; their memory is allocated in advance and the
+   *    results are written into it
+   * */
   const VariableNameMap &Outputs() const { return outputs_; }
+  /*
+   * @b The op type
+   * */
   const std::string &Type() const { return type_; }
+  /*
+   * @b Attributes the op needs for its computation, e.g. the stride of a
+   *    conv op
+   * */
   const AttributeMap &Attrs() const { return attrs_; }
   void ClearVariables(const std::vector &var_names) const {
     if (this->scope_) {
       this->scope_->EraseVars(var_names);
     }
   }
-
+  /*
+   * @b Computes the output shape from the input shapes and attributes
+   * */
+  virtual void InferShape() const = 0;
 protected:
   std::shared_ptr scope_;
   std::string type_;
@@ -85,6 +88,9 @@ class OperatorBase : PaddleMobileObject {
   void CheckAllInputOutputSet() const;
 };

+/*
+ * @b Parent class of every op that carries a computation; it inherits from
+ *    OperatorBase
+ * */
 template
 class OperatorWithKernel : public OperatorBase {
  public:
@@ -97,11 +103,18 @@ class OperatorWithKernel : public OperatorBase {
   virtual void InferShape() const = 0;
 };

+/*
+ * @b Parent class of all kernels
+ * */
 template
 class OpKernelBase : PaddleMobileObject {
  public:
+  /*
+   * @b Every kernel must implement the Compute method
+   * @p para a struct holding the parameters the kernel needs for its
+   *    computation; all of these structs live in
+   *    paddle-mobile/src/operators/op_param.h
+   * */
   virtual void Compute(const P &para) const = 0;
-
   virtual ~OpKernelBase() = default;
 };

@@ -118,8 +131,8 @@ class FusionOpMatcher : PaddleMobileObject {
   virtual std::string Type() = 0;

-  virtual void FolderNodes(Node &node) {
-    node.Folder(node_.Depth(), Type(), {});
+  virtual void FolderNodes(Node *node) {
+    node->Folder(node_.Depth(), Type(), {});
   }

   virtual Node &BeginNode() { return node_; }
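The comments added above spell out the op/kernel contract. A rough sketch of a new op following that pattern is below; ScaleOp, ScaleKernel, ScaleParam, and the "scale" type string are hypothetical names used only for illustration, not part of this patch series:

// Hypothetical sketch of the OperatorWithKernel / OpKernelBase pattern.
// ScaleParam would be defined in src/operators/op_param.h, as ReluParam is
// later in this series.
template <typename DeviceType, typename T>
class ScaleOp : public framework::OperatorWithKernel<DeviceType> {
 public:
  ScaleOp(const std::string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs, const framework::AttributeMap attrs,
          std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType>(type, inputs, outputs,
                                                  attrs, scope),
        param_(inputs, outputs, attrs, *scope) {}

  // An elementwise op's output keeps the input shape.
  void InferShape() const override {
    param_.Out()->Resize(param_.InputX()->dims());
  }

  // RunImpl hands the pre-built param struct to the kernel, as ReluOp does.
  void RunImpl() const {
    operators::ScaleKernel<DeviceType, T> kernel;
    kernel.Compute(param_);
  }

 protected:
  ScaleParam param_;
};

// Registration: the name given to USE_OP and REGISTER_OPERATOR must match
// the op type string stored in the model file.
namespace ops = paddle_mobile::operators;
USE_OP(scale);
REGISTER_OPERATOR(scale, ops::ScaleOp);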
diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp
index ac7137a47f3..f260fd0b61f 100644
--- a/src/framework/program/program-optimize/node.cpp
+++ b/src/framework/program/program-optimize/node.cpp
@@ -14,6 +14,7 @@ limitations under the License. */

 #include
+#include "framework/operator.h"
 #include "framework/program/program-optimize/node.h"

 namespace paddle_mobile {
@@ -73,24 +74,79 @@ void Node::OpDescs(uint index,
 }

 void Node::OpDescs(std::vector> *op_desc,
-                   Node *node) {
-  auto iter = std::find(op_desc->begin(), op_desc->end(), this->op_desc_);
+                   Node *node, bool adding_thread, int thread_num) {
+  bool can_add_split = false;
+  if (outputs_.size() > 1) {
+    can_add_split = true;
+    if (op_input_output_key[op_desc_->type_].second.size() != 1) {
+      DLOG << "current op desc does not have exactly 1 output";
+      can_add_split = false;
+    }
+    for (const auto& output : outputs_) {
+      if (op_input_output_key.find(output->op_desc_->type_) != op_input_output_key.end()) {
+        auto inputs_and_outputs = op_input_output_key[output->op_desc_->type_];
+        auto outputs_of_output = output->op_desc_->Output(inputs_and_outputs.second[0]);
+        auto inputs_of_output = output->op_desc_->Input(inputs_and_outputs.first[0]);
+        for (int i = 0; i < inputs_of_output.size(); ++i) {
+          std::string input_of_output = inputs_of_output[i];
+          for (int j = 0; j < outputs_of_output.size(); ++j) {
+            std::string output_of_output = outputs_of_output[j];
+            if (input_of_output == output_of_output) {
+              DLOG << "the downstream op's outputs contain its input "
+                   << input_of_output;
+              can_add_split = false;
+              break;
+            }
+          }
+        }
+      } else {
+        DLOG << "cannot find this op type: " << output->op_desc_->type_;
+        can_add_split = false;
+      }
+    }
+  }
+
   if (inputs_.size() > 1 && node != inputs_.back()) {
     return;
   } else if (inputs_.size() > 1 && node == inputs_.back()) {
+    adding_thread = false;
     op_desc->push_back(this->op_desc_);
   } else {
     op_desc->push_back(this->op_desc_);
   }
+  if (adding_thread) {
+    Attribute attr;
+    attr.Set(thread_num);
+    this->op_desc_->attrs_["thread"] = attr;
+  }

-  for (auto &output : outputs_) {
-    output->OpDescs(op_desc, this);
+  if (can_add_split) {
+    adding_thread = true;
+    std::shared_ptr split_op_desc = std::make_shared();
+    split_op_desc->type_ = G_OP_TYPE_SPLIT;
+    auto outputs = this->op_desc_->Output(op_input_output_key[this->op_desc_->Type()].second[0]);
+
+    split_op_desc->inputs_ = {{op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
+    auto &split_outputs = split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]];
+    for (const auto& output : outputs_) {
+      split_outputs.push_back(outputs[0]);
+    }
+    DLOG << "add split";
+    op_desc->push_back(split_op_desc);
+  }
+
+  for (int i = 0; i < outputs_.size(); ++i) {
+    auto &output = outputs_[i];
+    if (can_add_split) {
+      output->OpDescs(op_desc, this, adding_thread, i);
+    } else {
+      output->OpDescs(op_desc, this, adding_thread, thread_num);
+    }
   }
 }

 std::vector> Node::OpDescs() {
   std::vector> op_descs;
-  OpDescs(&op_descs, this);
+  OpDescs(&op_descs, this, false, 0);
   return op_descs;
 }
diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h
index da9a7ef5694..5dd1a3acbf5 100644
--- a/src/framework/program/program-optimize/node.h
+++ b/src/framework/program/program-optimize/node.h
@@ -42,13 +42,13 @@ class Node : PaddleMobileObject {
       std::map> change_map);
   std::vector> OpDescs(uint size);
   std::vector> OpDescs();
-  void OpDescs(std::vector> *op_desc,
-               Node *node);
   std::shared_ptr OpDesc() { return op_desc_; }
   std::string BeginType() { return type_; }
   void Description();

  private:
+  void OpDescs(std::vector> *op_desc,
+               Node *node, bool adding_thread, int thread_num);
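A note on the two parameters added above (an informal gloss, with a hypothetical two-consumer graph for illustration): adding_thread marks that the traversal is inside a forked branch, and thread_num is the branch index that gets recorded on each visited op as attrs_["thread"]. When a node feeds several consumers, its producing op has exactly one output key, and no consumer reads and writes the same variable (no in-place op), a split op of type G_OP_TYPE_SPLIT is synthesized between producer and consumers:

// before the pass:  conv2d_out is read directly by pool2d and relu
// after the pass:   conv2d -> split -> pool2d  (attrs_["thread"] = 0)
//                                  -> relu    (attrs_["thread"] = 1)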
void OpDescs(uint size, std::vector> *op_desc); void To(int index, std::shared_ptr); diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp index fd7edeed1b6..cd6899efe36 100644 --- a/src/framework/program/program-optimize/program_optimize.cpp +++ b/src/framework/program/program-optimize/program_optimize.cpp @@ -19,7 +19,7 @@ namespace paddle_mobile { namespace framework { -std::shared_ptr ProgramOptimize::Optimize() {} +//std::shared_ptr ProgramOptimize::Optimize() {} std::shared_ptr ProgramOptimize::FushionOptimize( std::shared_ptr ori_des) { @@ -86,7 +86,7 @@ std::shared_ptr ProgramOptimize::FushionOptimize( // DLOG << " match success " << " fusion node: \n" << // matcher->BeginNode() << "\nsub node: \n" << *sub_node; // DLOG << "match node\n"<< *match_node; - matcher->FolderNodes(*match_node); + matcher->FolderNodes(match_node.get()); // DLOG << " after match node\n"<< *match_node; // match_node->Description(); diff --git a/src/framework/program/program-optimize/program_optimize.h b/src/framework/program/program-optimize/program_optimize.h index 9dc4b19eba3..3839fa1e36b 100644 --- a/src/framework/program/program-optimize/program_optimize.h +++ b/src/framework/program/program-optimize/program_optimize.h @@ -27,7 +27,6 @@ namespace framework { class ProgramOptimize { public: ProgramOptimize() {} - std::shared_ptr Optimize(); std::shared_ptr FushionOptimize( std::shared_ptr ori_des); diff --git a/src/common/io.cpp b/src/io.cpp similarity index 91% rename from src/common/io.cpp rename to src/io.cpp index fc1466237e9..23b3e21ee81 100644 --- a/src/common/io.cpp +++ b/src/io.cpp @@ -15,15 +15,18 @@ limitations under the License. */ #include "io.h" #include #include -#include "common/enforce.h" #include "common/log.h" -#include "framework/framework.pb-c.h" -#include "framework/lod_tensor.h" -#include "framework/operator.h" -#include "framework/program/program_desc.h" -#include "framework/program/var_desc.h" + +#include "common/enforce.h" +#include "common/enforce.h" #include "framework/scope.h" #include "framework/tensor.h" +#include "framework/operator.h" +#include "framework/lod_tensor.h" +#include "framework/framework.pb-c.h" +#include "framework/program/var_desc.h" +#include "framework/program/program_desc.h" +#include "framework/program/program-optimize/program_optimize.h" namespace paddle_mobile { using framework::Variable; @@ -166,7 +169,7 @@ void Loader::LoadVar(framework::Variable *variable, template const framework::Program Loader::Load( - const std::string &dirname) { + const std::string &dirname, bool optimize) { std::string model_filename = dirname + "/__model__"; PaddleMobile__Framework__Proto__ProgramDesc *c_program; uint8_t *buf = NULL; @@ -199,11 +202,11 @@ const framework::Program Loader::Load( // DLOG << "var name-- " << var_desc->Name(); auto var = scope->Var(var_desc->Name()); + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Persistable() && var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH && var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) { - // DLOG << "to load var "; auto dim = var_desc->Tensor_desc().Dims(); auto tensor = var->GetMutable(); tensor->Resize(framework::make_ddim(dim)); @@ -219,8 +222,12 @@ const framework::Program Loader::Load( } } } + // originProgramDesc->Description("program: "); - originProgramDesc->Description("program: "); + if (optimize) { + framework::ProgramOptimize program_optimize; + program.optimizeProgram = 
program_optimize.FushionOptimize(originProgramDesc); + } paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); return program; @@ -231,33 +238,8 @@ template class Loader; #pragma mark - executor template -Executor::Executor(const framework::Program p) : program_(p) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - - const std::vector> blocks = - to_predict_program_->Blocks(); - for (int i = 0; i < blocks.size(); ++i) { - std::shared_ptr block_desc = blocks[i]; - std::vector> ops = block_desc->Ops(); - for (int j = 0; j < ops.size(); ++j) { - std::shared_ptr op = ops[j]; - auto op_base = framework::OpRegistry::CreateOp( - op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), - program_.scope); - op_base->InferShape(); - ops_of_block_[*block_desc.get()].push_back(op_base); - } - } - InitMemory(); -} - -template -Executor::Executor(const framework::Program p, int batch_size) - : program_(p), batch_size_(batch_size) { +Executor::Executor(const framework::Program p, int batch_size, bool use_optimize) + : program_(p), batch_size_(batch_size), use_optimize_(use_optimize) { if (use_optimize_) { to_predict_program_ = program_.optimizeProgram; } else { @@ -389,7 +371,7 @@ void Executor::InitMemory() { } template -void Executor::predict(const framework::Tensor &t, int block_id) { +void Executor::Predict(const framework::Tensor &t, int block_id) { framework::Variable *g_feed_value = program_.scope->Var("feed"); framework::Tensor *feed_tensor = g_feed_value->GetMutable(); @@ -404,11 +386,11 @@ void Executor::predict(const framework::Tensor &t, int block_id) { } template -std::vector::Ptype> Executor::predict( +std::vector::Ptype> Executor::Predict( const std::vector &input, const std::vector &dims) { framework::Tensor tensor(input, framework::make_ddim(dims)); - predict(tensor, 0); + Predict(tensor, 0); framework::Variable *g_feed_value = program_.scope->Var("col"); auto feed_tensor = g_feed_value->GetMutable(); diff --git a/src/common/io.h b/src/io.h similarity index 86% rename from src/common/io.h rename to src/io.h index 678441a9e05..8a73beba6d8 100644 --- a/src/common/io.h +++ b/src/io.h @@ -30,7 +30,7 @@ namespace paddle_mobile { template class Loader : PaddleMobileObject { public: - const framework::Program Load(const std::string &dirname); + const framework::Program Load(const std::string &dirname, bool optimize = true); private: void LoadVar(framework::Variable *variable, @@ -45,13 +45,11 @@ class Executor { Executor() = default; - Executor(const framework::Program p); + Executor(const framework::Program p, int batch_size = 1, bool use_optimize = true); - Executor(const framework::Program p, int batch_size); + // std::shared_ptr Predict(framework::Tensor &t); - std::shared_ptr predict(framework::Tensor &t); - - std::vector predict(const std::vector &input, + std::vector Predict(const std::vector &input, const std::vector &dims); protected: @@ -61,7 +59,7 @@ class Executor { framework::Program program_; int batch_size_ = 1; std::shared_ptr to_predict_program_; - void predict(const framework::Tensor &t, int block_id); + void Predict(const framework::Tensor &t, int block_id); std::map>>> ops_of_block_; diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index 39f11dd708c..cab55dc3617 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -23,18 +23,17 @@ namespace operators { class 
FushionConvAddReluOpMatcher : public framework::FusionOpMatcher {
  public:
   FushionConvAddReluOpMatcher() {
-    node_ = framework::Node("conv2d");
-    node_ > std::make_shared("elementwise_add") >
-        std::make_shared("relu");
+    node_ = framework::Node(G_OP_TYPE_CONV);
+    node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) >
+        std::make_shared(G_OP_TYPE_RELU);
   }

   void FolderNodes(framework::Node &node) {
     std::vector> origin_descs =
         node.OpDescs(node_.Depth());
-    node.Folder(node_.Depth(), Type(), {{"elementwise_add", {"Y", "Z"}}});
+    node.Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
   }
-
-  std::string Type() { return "FusionConvAddRelu"; }
+  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; }
 };

 class FusionFcOp {
diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h
index 0ed5a2b4d5e..fd6f2658fd1 100644
--- a/src/operators/fusion_fc_op.h
+++ b/src/operators/fusion_fc_op.h
@@ -28,17 +28,17 @@ using std::vector;
 class FusionFcMatcher : public framework::FusionOpMatcher {
  public:
   FusionFcMatcher() {
-    node_ = framework::Node("mul");
-    node_ > std::make_shared("elementwise_add");
+    node_ = framework::Node(G_OP_TYPE_MUL);
+    node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD);
   }

   void FolderNodes(framework::Node &node) {
     vector> origin_descs =
         node.OpDescs(node_.Depth());
-    node.Folder(node_.Depth(), Type(), {{"elementwise_add", {"Y", "Z"}}});
+    node.Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
   }

-  std::string Type() { return "fc"; }
+  std::string Type() { return G_OP_TYPE_FC; }
 };

 template
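Both matchers describe a fusion pattern as a short chain of nodes. As a usage sketch, this mirrors what test/framework/test_optimize.cpp does later in this series; only the surrounding driver code is illustrative:

// Fold matched chains in a loaded program; for FusionFcMatcher a
// mul -> elementwise_add pair becomes a single "fc" node, and the
// change_map {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}} renames
// elementwise_add's input Y to Z on the fused op.
paddle_mobile::framework::ProgramOptimize optimize;
auto optimized = optimize.FushionOptimize(program.originProgram);
if (optimized != nullptr) {
  optimized->Description("optimize");
}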
diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp
index e0badea51e7..96fcb7c3088 100644
--- a/src/operators/kernel/arm/relu_kernel.cpp
+++ b/src/operators/kernel/arm/relu_kernel.cpp
@@ -20,11 +20,15 @@ limitations under the License. */

 namespace paddle_mobile {
 namespace operators {
+
 template
 struct ReluFunctor {
   inline T operator()(T in) const { return in > 0 ? in : 0; }
 };

+/*
+ * @b Platform-specific implementation; param is passed in from the op layer
+ * */
 template <>
 void ReluKernel::Compute(const ReluParam &param) const {
   const auto *input_x = param.InputX();
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 02bda7147aa..0ce187c0849 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -696,6 +696,9 @@ class ReshapeParam : public OpParam {
   bool inplace_;
 };

+/*
+ * @b The op layer builds this param and hands it to the kernel layer
+ * */
 class ReluParam : public OpParam {
  public:
   ReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -725,7 +728,6 @@ class FushionFcParam : public OpParam {
     y_num_col_dims_ = GetAttr("y_num_col_dims", attrs);
     axis_ = GetAttr("axis", attrs);
   }
-
   const Tensor *InputX() const { return input_x_; }

   const Tensor *InputY() const { return input_y_; }
diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp
index 5f861579ab4..35791b28845 100644
--- a/src/operators/relu_op.cpp
+++ b/src/operators/relu_op.cpp
@@ -25,6 +25,10 @@ template class ReluOp;
 }  // namespace operators
 }  // namespace paddle_mobile

+/*
+ * @b Every op has to be registered;
+ * the argument of USE_OP and the first argument of REGISTER_OPERATOR must both match the op type stored in the model
+ * */
 namespace ops = paddle_mobile::operators;
 USE_OP(relu);
 REGISTER_OPERATOR(relu, ops::ReluOp);
diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h
index 6c3a614a1a0..aed907e0f87 100644
--- a/src/operators/relu_op.h
+++ b/src/operators/relu_op.h
@@ -28,6 +28,9 @@ using paddle_mobile::framework::Tensor;
 template
 class ReluOp : public framework::OperatorWithKernel {
  public:
+  /*
+   * @b Constructor of the op; it calls the parent constructor and builds
+   *    the op's own param struct
+   * */
   ReluOp(const std::string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs, const framework::AttributeMap attrs,
          std::shared_ptr scope)
@@ -35,6 +38,9 @@ class ReluOp : public framework::OperatorWithKernel {
             scope),
         param_(inputs, outputs, attrs, *scope) {}

+  /*
+   * @b Runs the op by invoking the corresponding kernel
+   * */
   void RunImpl() const {
     operators::ReluKernel kernel;
     kernel.Compute(param_);
@@ -44,6 +50,10 @@
   void InferShape() const override;

  protected:
+  /*
+   * @b The param struct the relu kernel computes with; it is defined in
+   *    paddle-mobile/src/operators/op_param.h
+   * */
   ReluParam param_;
 };
diff --git a/test/executor_for_test.h b/test/executor_for_test.h
index 48b6b5cf3c6..2893eccd80e 100644
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -17,7 +17,7 @@ limitations under the License. */

 #include
 #include

-#include "common/io.h"
+#include "io.h"
 #include "common/log.h"
 #include "framework/op_registry.h"
 #include "operators/conv_op.h"
@@ -73,7 +73,7 @@ class Executor4Test : public Executor {
   }

   template
-  vector> predict(const vector &ts,
+  vector> Predict(const vector &ts,
                   const vector &input_names,
                   const vector &output_names,
                   const vector &ddims) {
@@ -116,7 +116,7 @@
     return output_tensor_sptrs;
   }

-  std::shared_ptr predict(const Tensor &t, string input, string output,
+  std::shared_ptr Predict(const Tensor &t, string input, string output,
                           const DDim &dDim) {
     auto scope = this->program_.scope;
     Variable *g_feed_value = scope->Var(input);
diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp
index fe403b55a18..19871f95557 100644
--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -#include "common/io.h" +#include "io.h" +#include "../test_helper.h" int main() { paddle_mobile::Loader loader; - // ../../../test/models/googlenet // ../../../test/models/mobilenet - auto program = loader.Load(std::string("../models/googlenet")); + auto program = loader.Load(g_googlenet); + program.optimizeProgram->Description("program desc: "); return 0; } diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp index 4c4dc6eb3ee..b371fb63b93 100644 --- a/test/framework/test_optimize.cpp +++ b/test/framework/test_optimize.cpp @@ -12,16 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "common/io.h" +#include "io.h" +#include "../test_helper.h" #include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/program_optimize.h" int main() { paddle_mobile::Loader loader; // "../../../test/models/googlenet" - auto program = loader.Load("../models/googlenet"); + auto program = loader.Load(g_googlenet); paddle_mobile::framework::ProgramOptimize optimize; - // program.originProgram->Description("origin"); +// program.originProgram->Description("origin"); auto optimize_program = optimize.FushionOptimize(program.originProgram); if (optimize_program != nullptr) { optimize_program->Description("optimize"); diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index d52f080277a..139579e9116 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -21,16 +21,16 @@ int main() { // ../../../test/models/googlenet // ../../../test/models/mobilenet auto time1 = time(); - auto program = loader.Load(std::string("../models/googlenet")); + auto program = loader.Load(g_googlenet, false); auto time2 = time(); DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor executor(program, 1); + paddle_mobile::Executor executor(program, 1, false); std::vector input; std::vector dims{1, 3, 224, 224}; GetInput(g_test_image_1x3x224x224, &input, dims); auto time3 = time(); - executor.predict(input, dims); + executor.Predict(input, dims); auto time4 = time(); DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; return 0; diff --git a/test/operators/test_batchnorm_op.cpp b/test/operators/test_batchnorm_op.cpp index ba2e06b80b4..0acd6ea5726 100644 --- a/test/operators/test_batchnorm_op.cpp +++ b/test/operators/test_batchnorm_op.cpp @@ -129,7 +129,7 @@ int main() { DLOG << "begin to run BatchNormOp Test"; paddle_mobile::Loader loader; auto program = loader.Load(std::string( - "../../test/models/image_classification_resnet.inference.model")); + g_resnet)); /// input x (4,10,2,2) paddle_mobile::framework::Tensor inputx1; diff --git a/test/operators/test_box_coder_op.cpp b/test/operators/test_box_coder_op.cpp index b7695c91dfb..dac0d0b8051 100644 --- a/test/operators/test_box_coder_op.cpp +++ b/test/operators/test_box_coder_op.cpp @@ -116,7 +116,7 @@ int main() { DLOG << "----------**********----------"; DLOG << "begin to run BoxCoderOp Test"; paddle_mobile::Loader loader; - auto program = loader.Load(std::string("../../test/models/mobilenet+ssd")); + auto program = loader.Load(std::string(g_mobilenet_ssd)); paddle_mobile::framework::Tensor priorbox; SetupTensor(&priorbox, {1917, 4}, static_cast(0), diff --git a/test/operators/test_concat_op.cpp b/test/operators/test_concat_op.cpp index a9bb072f1e9..7a106b03c44 100644 --- 
a/test/operators/test_concat_op.cpp +++ b/test/operators/test_concat_op.cpp @@ -57,7 +57,7 @@ int main() { auto out_ddim = paddle_mobile::framework::make_ddim({3, 100, 2, 2}); out_ddims.push_back(out_ddim); - auto output = executor.predict(input_tensors, input_names, + auto output = executor.Predict(input_tensors, input_names, output_names, out_ddims); auto output0_data = output[0]->data(); diff --git a/test/operators/test_cov_op.cpp b/test/operators/test_cov_op.cpp index 2fe7f3577be..ba6a9b4800f 100644 --- a/test/operators/test_cov_op.cpp +++ b/test/operators/test_cov_op.cpp @@ -34,7 +34,7 @@ int main() { // static_cast(1)); auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112}); - auto output = executor.predict(input, "data", "conv2d_0.tmp_0", out_ddim); + auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim); auto output_ptr = output->data(); for (int j = 0; j < output->numel(); ++j) { diff --git a/test/operators/test_elementwise_add_op.cpp b/test/operators/test_elementwise_add_op.cpp index 1b4bf457a2c..c4997f2eb37 100644 --- a/test/operators/test_elementwise_add_op.cpp +++ b/test/operators/test_elementwise_add_op.cpp @@ -50,7 +50,7 @@ int main() { auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 224, 224}); out_ddims.push_back(out_ddim); - auto output = executor.predict(input_tensors, input_names, + auto output = executor.Predict(input_tensors, input_names, output_names, out_ddims); auto output0_data = output[0]->data(); diff --git a/test/operators/test_fushion_fc_op.cpp b/test/operators/test_fushion_fc_op.cpp index 6063772d85a..8dc1b02bec4 100644 --- a/test/operators/test_fushion_fc_op.cpp +++ b/test/operators/test_fushion_fc_op.cpp @@ -116,7 +116,7 @@ int main() { DLOG << "begin to run Fc Test"; paddle_mobile::Loader loader; // "../../../test/models/googlenet" - auto program = loader.Load("../models/googlenet"); + auto program = loader.Load(g_googlenet); paddle_mobile::framework::ProgramOptimize optimize; // program.originProgram->Description("origin"); auto optimize_program = optimize.FushionOptimize(program.originProgram); diff --git a/test/operators/test_lrn_op.cpp b/test/operators/test_lrn_op.cpp index ba35639fb71..cf5fd4bdf2d 100644 --- a/test/operators/test_lrn_op.cpp +++ b/test/operators/test_lrn_op.cpp @@ -46,7 +46,7 @@ int main() { auto out_ddim = paddle_mobile::framework::make_ddim({3, 4, 2, 2}); out_ddims.push_back(out_ddim); - auto output = executor.predict(input_tensors, input_names, + auto output = executor.Predict(input_tensors, input_names, output_names, out_ddims); auto output0_data = output[0]->data(); diff --git a/test/operators/test_mul_op.cpp b/test/operators/test_mul_op.cpp index 8acd4a99470..5412e6905b7 100644 --- a/test/operators/test_mul_op.cpp +++ b/test/operators/test_mul_op.cpp @@ -50,7 +50,7 @@ int main() { auto out_ddim = paddle_mobile::framework::make_ddim({3, 3}); out_ddims.push_back(out_ddim); - auto output = executor.predict(input_tensors, input_names, + auto output = executor.Predict(input_tensors, input_names, output_names, out_ddims); auto output0_data = output[0]->data(); diff --git a/test/operators/test_pool_op.cpp b/test/operators/test_pool_op.cpp index 8a1c0a7ccec..62dfc20dc12 100644 --- a/test/operators/test_pool_op.cpp +++ b/test/operators/test_pool_op.cpp @@ -14,11 +14,11 @@ limitations under the License. 
*/ #include "../executor_for_test.h" #include "../test_helper.h" -#include "common/io.h" +#include "io.h" int main() { paddle_mobile::Loader loader; - auto program = loader.Load(std::string("../models/googlenet")); + auto program = loader.Load(std::string(g_googlenet)); if (program.originProgram == nullptr) { DLOG << "program read file"; } @@ -32,7 +32,7 @@ int main() { static_cast(1)); auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 56, 56}); auto output = - executor.predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim); + executor.Predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim); float *output_ptr = output->data(); for (int j = 0; j < output->numel(); ++j) { diff --git a/test/operators/test_prior_box_op.cpp b/test/operators/test_prior_box_op.cpp index 80ede944936..8c697a9a798 100644 --- a/test/operators/test_prior_box_op.cpp +++ b/test/operators/test_prior_box_op.cpp @@ -127,7 +127,7 @@ int main() { DLOG << "----------**********----------"; DLOG << "begin to run PriorBoxOp Test"; paddle_mobile::Loader loader; - auto program = loader.Load(std::string("../../test/models/mobilenet+ssd")); + auto program = loader.Load(std::string(g_mobilenet_ssd)); /// input x (1,3,300,300) paddle_mobile::framework::Tensor input_image; diff --git a/test/operators/test_relu_op.cpp b/test/operators/test_relu_op.cpp index fb68b921113..50f3b6a20b6 100644 --- a/test/operators/test_relu_op.cpp +++ b/test/operators/test_relu_op.cpp @@ -46,7 +46,7 @@ int main() { auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4}); out_ddims.push_back(out_ddim); - auto output = executor.predict(input_tensors, input_names, + auto output = executor.Predict(input_tensors, input_names, output_names, out_ddims); auto output0_data = output[0]->data(); diff --git a/test/operators/test_reshape_op.cpp b/test/operators/test_reshape_op.cpp index b0251e693a7..5448aac87c2 100644 --- a/test/operators/test_reshape_op.cpp +++ b/test/operators/test_reshape_op.cpp @@ -14,11 +14,11 @@ limitations under the License. */ #include "../executor_for_test.h" #include "../test_helper.h" -#include "common/io.h" +#include "io.h" int main() { paddle_mobile::Loader loader; - auto program = loader.Load(std::string("../../test/models/mobilenet+ssd")); + auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; } @@ -31,7 +31,7 @@ int main() { auto input_ptr = input.data(); auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2}); auto output = - executor.predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim); + executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim); auto *output_ptr = output->data(); DLOG << "input : "; diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp index dcd35cd8e46..289eac149fa 100644 --- a/test/operators/test_sigmoid_op.cpp +++ b/test/operators/test_sigmoid_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../../src/operators/kernel/sigmoid_kernel.h" #include "../test_helper.h" -#include "common/io.h" +#include "io.h" int main() { paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_softmax_op.cpp b/test/operators/test_softmax_op.cpp index 094c48adbb6..58de5300cca 100644 --- a/test/operators/test_softmax_op.cpp +++ b/test/operators/test_softmax_op.cpp @@ -14,11 +14,11 @@ limitations under the License. 
*/ #include "../executor_for_test.h" #include "../test_helper.h" -#include "common/io.h" +#include "io.h" int main() { paddle_mobile::Loader loader; - auto program = loader.Load(std::string("../models/mobilenet")); + auto program = loader.Load(std::string(g_mobilenet)); if (program.originProgram == nullptr) { DLOG << "program read file"; } @@ -30,7 +30,7 @@ int main() { static_cast(1)); auto out_ddim = paddle_mobile::framework::make_ddim({1, 1000}); auto output = - executor.predict(input, "reshape_0.tmp_0", "softmax_0.tmp_0", out_ddim); + executor.Predict(input, "reshape_0.tmp_0", "softmax_0.tmp_0", out_ddim); auto *output_ptr = output->data(); for (int j = 0; j < output->numel(); ++j) { DLOG << " value of output: " << output_ptr[j]; diff --git a/test/operators/test_transpose_op.cpp b/test/operators/test_transpose_op.cpp index 23e3bc3ec47..4c88df2d83d 100644 --- a/test/operators/test_transpose_op.cpp +++ b/test/operators/test_transpose_op.cpp @@ -14,11 +14,11 @@ limitations under the License. */ #include "../executor_for_test.h" #include "../test_helper.h" -#include "common/io.h" +#include "io.h" int main() { paddle_mobile::Loader loader; - auto program = loader.Load(std::string("../../test/models/mobilenet+ssd")); + auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; } @@ -31,7 +31,7 @@ int main() { auto input_ptr = input.data(); auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 4, 2}); auto output = - executor.predict(input, "conv2d_22.tmp_1", "transpose_0.tmp_0", out_ddim); + executor.Predict(input, "conv2d_22.tmp_1", "transpose_0.tmp_0", out_ddim); auto *output_ptr = output->data(); DLOG << "input : "; diff --git a/test/test_helper.h b/test/test_helper.h index dba4dec9bbc..0fec49e4e92 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include -#include #include +#include #include "common/log.h" #include "framework/ddim.h" diff --git a/test/test_include.h b/test/test_include.h index 19a9bff8846..0046bdb4e41 100644 --- a/test/test_include.h +++ b/test/test_include.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "./test_helper.h" #include "common/enforce.h" -#include "common/io.h" +#include "io.h" #include "common/log.h" #include "framework/lod_tensor.h" #include "framework/operator.h" From 50da2d3114a6a7af9468aad3c6bf6995c04e079c Mon Sep 17 00:00:00 2001 From: liuruilong Date: Wed, 30 May 2018 15:31:27 +0800 Subject: [PATCH 13/26] format files --- src/common/types.h | 28 +++++++++---------- src/framework/operator.h | 17 +++++------ .../program/program-optimize/node.cpp | 27 +++++++++++------- .../program-optimize/program_optimize.cpp | 2 +- src/io.cpp | 20 ++++++------- src/io.h | 6 ++-- src/operators/fusion_conv_add_relu_op.h | 3 +- src/operators/fusion_fc_op.h | 3 +- src/operators/kernel/arm/relu_kernel.cpp | 1 - src/operators/relu_op.cpp | 3 +- src/operators/relu_op.h | 2 +- test/executor_for_test.h | 2 +- test/framework/test_load.cpp | 2 +- test/framework/test_optimize.cpp | 4 +-- test/operators/test_batchnorm_op.cpp | 3 +- test/test_helper.h | 2 +- test/test_include.h | 2 +- 17 files changed, 69 insertions(+), 58 deletions(-) diff --git a/src/common/types.h b/src/common/types.h index 252c747d75f..b25ae329931 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -15,8 +15,8 @@ limitations under the License. 
*/

 #pragma once;

 #include
-#include
 #include
+#include

 namespace paddle_mobile {
 enum class Precision : int { FP32 = 0 };
@@ -94,18 +94,18 @@ static const std::string G_OP_TYPE_FEED = "feed";
 static const std::string G_OP_TYPE_FETCH = "fetch";

 static std::unordered_map<
-    std::string, std::pair, std::vector>>
-    op_input_output_key = {{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
-                           {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
-                           {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
-                           {G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}};
+    std::string, std::pair, std::vector>>
+    op_input_output_key = {{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
+                          {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
+                          {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
+                          {G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
+                          {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
+                          {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
+                          {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
+                          {G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
+                          {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
+                          {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}},
+                          {G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
+                          {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}};

 }  // namespace paddle_mobile
diff --git a/src/framework/operator.h b/src/framework/operator.h
index e9dc6f6fb75..8e5e55fb469 100644
--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -19,20 +19,20 @@ limitations under the License. */

 #include
 #include

-#include "common/types.h"
 #include "common/enforce.h"
-#include "common/variant.h"
-#include "framework/scope.h"
-#include "framework/tensor.h"
-#include "framework/op_info.h"
 #include "common/type_define.h"
-#include "framework/variable.h"
-#include "framework/attribute.h"
+#include "common/types.h"
+#include "common/variant.h"
+#include "framework/attribute.h"
+#include "framework/op_info.h"
+#include "framework/op_kernel_type.h"
 #include "framework/op_registry.h"
-#include "framework/op_kernel_type.h"
+#include "framework/paddle_mobile_object.h"
 #include "framework/program/block_desc.h"
-#include "framework/paddle_mobile_object.h"
 #include "framework/program/program-optimize/node.h"
+#include "framework/scope.h"
+#include "framework/tensor.h"
+#include "framework/variable.h"

 namespace paddle_mobile {
 namespace framework {
@@ -77,6 +77,7 @@ class OperatorBase : PaddleMobileObject {
    * @b Computes the output shape from the input shapes and attributes
    * */
   virtual void InferShape() const = 0;
+
  protected:
   std::shared_ptr scope_;
   std::string type_;
diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp
index f260fd0b61f..820fa6a443c 100644
--- a/src/framework/program/program-optimize/node.cpp
+++ b/src/framework/program/program-optimize/node.cpp
@@ -82,11 +82,14 @@ void Node::OpDescs(std::vector> *op_desc,
         DLOG << "current op desc does not have exactly 1 output";
         can_add_split = false;
       }
-      for (const auto& output : outputs_) {
-        if (op_input_output_key.find(output->op_desc_->type_) != op_input_output_key.end()) {
+      for (const auto &output : outputs_) {
+        if (op_input_output_key.find(output->op_desc_->type_) !=
+            op_input_output_key.end()) {
           auto inputs_and_outputs = op_input_output_key[output->op_desc_->type_];
-          auto outputs_of_output = output->op_desc_->Output(inputs_and_outputs.second[0]);
-          auto inputs_of_output = output->op_desc_->Input(inputs_and_outputs.first[0]);
+          auto outputs_of_output =
output->op_desc_->Output(inputs_and_outputs.second[0]); + auto inputs_of_output = + output->op_desc_->Input(inputs_and_outputs.first[0]); for (int i = 0; i < inputs_of_output.size(); ++i) { std::string input_of_output = inputs_of_output[i]; for (int j = 0; j < outputs_of_output.size(); ++j) { @@ -121,13 +124,17 @@ void Node::OpDescs(std::vector> *op_desc, if (can_add_split) { adding_thread = true; - std::shared_ptr split_op_desc = std::make_shared(); + std::shared_ptr split_op_desc = + std::make_shared(); split_op_desc->type_ = G_OP_TYPE_SPLIT; - auto outputs = this->op_desc_->Output(op_input_output_key[this->op_desc_->Type()].second[0]); - - split_op_desc->inputs_ = {{op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}}; - auto &split_outputs = split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]]; - for (const auto& output : outputs_) { + auto outputs = this->op_desc_->Output( + op_input_output_key[this->op_desc_->Type()].second[0]); + + split_op_desc->inputs_ = { + {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}}; + auto &split_outputs = + split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]]; + for (const auto &output : outputs_) { split_outputs.push_back(outputs[0]); } DLOG << "add split"; diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp index cd6899efe36..737fed9bd56 100644 --- a/src/framework/program/program-optimize/program_optimize.cpp +++ b/src/framework/program/program-optimize/program_optimize.cpp @@ -19,7 +19,7 @@ namespace paddle_mobile { namespace framework { -//std::shared_ptr ProgramOptimize::Optimize() {} +// std::shared_ptr ProgramOptimize::Optimize() {} std::shared_ptr ProgramOptimize::FushionOptimize( std::shared_ptr ori_des) { diff --git a/src/io.cpp b/src/io.cpp index 23b3e21ee81..002e73b7964 100644 --- a/src/io.cpp +++ b/src/io.cpp @@ -18,15 +18,14 @@ limitations under the License. 
*/ #include "common/log.h" #include "common/enforce.h" -#include "common/enforce.h" -#include "framework/scope.h" -#include "framework/tensor.h" -#include "framework/operator.h" -#include "framework/lod_tensor.h" #include "framework/framework.pb-c.h" -#include "framework/program/var_desc.h" -#include "framework/program/program_desc.h" +#include "framework/lod_tensor.h" +#include "framework/operator.h" #include "framework/program/program-optimize/program_optimize.h" +#include "framework/program/program_desc.h" +#include "framework/program/var_desc.h" +#include "framework/scope.h" +#include "framework/tensor.h" namespace paddle_mobile { using framework::Variable; @@ -202,7 +201,6 @@ const framework::Program Loader::Load( // DLOG << "var name-- " << var_desc->Name(); auto var = scope->Var(var_desc->Name()); - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Persistable() && var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH && @@ -226,7 +224,8 @@ const framework::Program Loader::Load( if (optimize) { framework::ProgramOptimize program_optimize; - program.optimizeProgram = program_optimize.FushionOptimize(originProgramDesc); + program.optimizeProgram = + program_optimize.FushionOptimize(originProgramDesc); } paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); @@ -238,7 +237,8 @@ template class Loader; #pragma mark - executor template -Executor::Executor(const framework::Program p, int batch_size, bool use_optimize) +Executor::Executor(const framework::Program p, int batch_size, + bool use_optimize) : program_(p), batch_size_(batch_size), use_optimize_(use_optimize) { if (use_optimize_) { to_predict_program_ = program_.optimizeProgram; diff --git a/src/io.h b/src/io.h index 8a73beba6d8..de2d359bf58 100644 --- a/src/io.h +++ b/src/io.h @@ -30,7 +30,8 @@ namespace paddle_mobile { template class Loader : PaddleMobileObject { public: - const framework::Program Load(const std::string &dirname, bool optimize = true); + const framework::Program Load(const std::string &dirname, + bool optimize = true); private: void LoadVar(framework::Variable *variable, @@ -45,7 +46,8 @@ class Executor { Executor() = default; - Executor(const framework::Program p, int batch_size = 1, bool use_optimize = true); + Executor(const framework::Program p, int batch_size = 1, + bool use_optimize = true); // std::shared_ptr Predict(framework::Tensor &t); diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index cab55dc3617..1fa3399cf22 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -31,7 +31,8 @@ class FushionConvAddReluOpMatcher : public framework::FusionOpMatcher { void FolderNodes(framework::Node &node) { std::vector> origin_descs = node.OpDescs(node_.Depth()); - node.Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}); + node.Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}); } std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; } }; diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index fd6f2658fd1..fb49fa61b20 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -35,7 +35,8 @@ class FusionFcMatcher : public framework::FusionOpMatcher { void FolderNodes(framework::Node &node) { vector> origin_descs = node.OpDescs(node_.Depth()); - node.Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}); + node.Folder(node_.Depth(), Type(), + 
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
   }

   std::string Type() { return G_OP_TYPE_FC; }
diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp
index 96fcb7c3088..586d9811751 100644
--- a/src/operators/kernel/arm/relu_kernel.cpp
+++ b/src/operators/kernel/arm/relu_kernel.cpp
@@ -20,7 +20,6 @@ limitations under the License. */

 namespace paddle_mobile {
 namespace operators {

-
 template
 struct ReluFunctor {
   inline T operator()(T in) const { return in > 0 ? in : 0; }
diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp
index 35791b28845..21bcc605282 100644
--- a/src/operators/relu_op.cpp
+++ b/src/operators/relu_op.cpp
@@ -27,7 +27,8 @@ template class ReluOp;

 /*
  * @b Every op has to be registered;
- * the argument of USE_OP and the first argument of REGISTER_OPERATOR must both match the op type stored in the model
+ * the argument of USE_OP and the first argument of REGISTER_OPERATOR
+ * must both match the op type stored in the model
  * */
 namespace ops = paddle_mobile::operators;
 USE_OP(relu);
diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h
index aed907e0f87..7be8cd249cb 100644
--- a/src/operators/relu_op.h
+++ b/src/operators/relu_op.h
@@ -38,7 +38,7 @@ class ReluOp : public framework::OperatorWithKernel {
             scope),
         param_(inputs, outputs, attrs, *scope) {}

-   /*
+  /*
    * @b Runs the op by invoking the corresponding kernel
    * */
   void RunImpl() const {
diff --git a/test/executor_for_test.h b/test/executor_for_test.h
index 2893eccd80e..ce3c84e986e 100644
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -17,9 +17,9 @@ limitations under the License. */

 #include
 #include

-#include "io.h"
 #include "common/log.h"
 #include "framework/op_registry.h"
+#include "io.h"
 #include "operators/conv_op.h"
 #include "operators/elementwise_add_op.h"
 #include "operators/pool_op.h"
diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp
index 19871f95557..95357547e1b 100644
--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "io.h"
 #include "../test_helper.h"
+#include "io.h"

 int main() {
   paddle_mobile::Loader loader;
diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp
index b371fb63b93..f0392cfec02 100644
--- a/test/framework/test_optimize.cpp
+++ b/test/framework/test_optimize.cpp
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -#include "io.h" #include "../test_helper.h" #include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/program_optimize.h" +#include "io.h" int main() { paddle_mobile::Loader loader; // "../../../test/models/googlenet" auto program = loader.Load(g_googlenet); paddle_mobile::framework::ProgramOptimize optimize; -// program.originProgram->Description("origin"); + // program.originProgram->Description("origin"); auto optimize_program = optimize.FushionOptimize(program.originProgram); if (optimize_program != nullptr) { optimize_program->Description("optimize"); diff --git a/test/operators/test_batchnorm_op.cpp b/test/operators/test_batchnorm_op.cpp index 0acd6ea5726..38d9f624909 100644 --- a/test/operators/test_batchnorm_op.cpp +++ b/test/operators/test_batchnorm_op.cpp @@ -128,8 +128,7 @@ int main() { DLOG << "----------**********----------"; DLOG << "begin to run BatchNormOp Test"; paddle_mobile::Loader loader; - auto program = loader.Load(std::string( - g_resnet)); + auto program = loader.Load(std::string(g_resnet)); /// input x (4,10,2,2) paddle_mobile::framework::Tensor inputx1; diff --git a/test/test_helper.h b/test/test_helper.h index 0fec49e4e92..dba4dec9bbc 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include -#include #include +#include #include "common/log.h" #include "framework/ddim.h" diff --git a/test/test_include.h b/test/test_include.h index 0046bdb4e41..25efbb9f4c0 100644 --- a/test/test_include.h +++ b/test/test_include.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "./test_helper.h" #include "common/enforce.h" -#include "io.h" #include "common/log.h" #include "framework/lod_tensor.h" #include "framework/operator.h" @@ -30,3 +29,4 @@ limitations under the License. */ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" +#include "io.h" From 8d3c8d674c446d66f5539814a17d5aabc1ea72b0 Mon Sep 17 00:00:00 2001 From: zhaojiaying01 Date: Wed, 30 May 2018 16:26:32 +0800 Subject: [PATCH 14/26] submit depthwise_conv_op and test --- src/operators/conv_op.cpp | 7 - src/operators/conv_op.h | 7 + src/operators/depthwise_conv_op.cpp | 57 ++++++++ src/operators/depthwise_conv_op.h | 49 +++++++ src/operators/kernel/arm/conv_kernel.cpp | 13 -- .../kernel/arm/depthwise_conv_kernel.cpp | 126 ++++++++++++++++++ src/operators/kernel/conv_kernel.h | 21 ++- src/operators/kernel/depthwise_conv_kernel.h | 34 +++++ test/CMakeLists.txt | 4 + test/operators/test_depthwise_conv_op.cpp | 46 +++++++ 10 files changed, 342 insertions(+), 22 deletions(-) create mode 100644 src/operators/depthwise_conv_op.cpp create mode 100644 src/operators/depthwise_conv_op.h create mode 100644 src/operators/kernel/arm/depthwise_conv_kernel.cpp create mode 100644 src/operators/kernel/depthwise_conv_kernel.h create mode 100644 test/operators/test_depthwise_conv_op.cpp diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp index 148b0f69f96..bfddcf14acb 100644 --- a/src/operators/conv_op.cpp +++ b/src/operators/conv_op.cpp @@ -21,13 +21,6 @@ limitations under the License. 
*/ namespace paddle_mobile { namespace operators { -int ConvOutputSize(int input_size, int filter_size, int dilation, int padding, - int stride) { - const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; - return output_size; -} - template void ConvOp::InferShape() const { // std::cout << " begin get dims: " << std::endl; diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h index 1557f2f06ee..f15f286b606 100644 --- a/src/operators/conv_op.h +++ b/src/operators/conv_op.h @@ -44,5 +44,12 @@ class ConvOp : public framework::OperatorWithKernel { ConvParam param_; }; +inline int ConvOutputSize(int input_size, int filter_size, int dilation, + int padding, int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + return output_size; +} + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp new file mode 100644 index 00000000000..2538298175c --- /dev/null +++ b/src/operators/depthwise_conv_op.cpp @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/depthwise_conv_op.h" +#include +#include "framework/data_type.h" +#include "framework/op_proto_maker.h" +#include "framework/op_registry.h" +#include "operators/conv_op.h" + +namespace paddle_mobile { +namespace operators { + +template +void DepthwiseConvOp::InferShape() const { + auto in_dims = param_.Input()->dims(); + auto filter_dims = param_.Filter()->dims(); + const std::vector &strides = param_.Strides(); + std::vector paddings = param_.Paddings(); + int groups = param_.Groups(); + std::vector dilations = param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + param_.Output()->Resize(ddim); +} + +template class DepthwiseConvOp; + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +USE_OP(depthwise_conv2d); +REGISTER_OPERATOR(depthwise_conv2d, ops::DepthwiseConvOp); diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h new file mode 100644 index 00000000000..c47fa0ffcac --- /dev/null +++ b/src/operators/depthwise_conv_op.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "framework/operator.h" +#include "operators/kernel/depthwise_conv_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template +class DepthwiseConvOp : public framework::OperatorWithKernel { + public: + DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel(type, inputs, outputs, attrs, + scope), + param_(inputs, outputs, attrs, *scope) {} + + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape() const override; + + void RunImpl() const { + operators::DepthwiseConvKernel kernel; + kernel.Compute(param_); + this->ClearVariables({"Filter", "Input"}); + } + + private: + ConvParam param_; +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index 1e2572b9847..f04b8156c9d 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -17,19 +17,6 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { -bool IsExpand(const std::vector &filter_dim, - const std::vector &strides, const std::vector &paddings, - const std::vector &dilations) { - bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; - for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); - strides_1 = strides_1 && (strides[j] == 1); - padding_0 = padding_0 && (paddings[j] == 0); - dilation_1 = dilation_1 && (dilations[j] == 1); - } - return !(filter_1 && strides_1 && padding_0 && dilation_1); -} - template <> void ConvKernel::Compute(const ConvParam ¶m) const { LOG(kLOG_DEBUG) << param; diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp new file mode 100644 index 00000000000..73aa9953cfc --- /dev/null +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -0,0 +1,126 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "operators/kernel/depthwise_conv_kernel.h" +#include "operators/kernel/conv_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { + LOG(kLOG_DEBUG) << param; + + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor *output = param.Output(); + output->mutable_data(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + DLOG << " compute end get Attrs " << strides[0]; + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + std::vector output_shape_vec(framework::vectorize(output->dims())); + + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + DLOG << " col_shape = " << col_shape; + DLOG << " col_matrix_shape = " << col_matrix_shape; + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + DLOG << " input_shape = " << input_shape; + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + DLOG << " filter.dims() = " << filter.dims(); + + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + DLOG << " in_batch.dims() = " << in_batch.dims(); + DLOG << " out_batch.dims() = " << out_batch.dims(); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + DLOG << " out_slice " << out_slice.dims(); + DLOG << " filter_slice " << filter_slice.dims(); + DLOG << " col_matrix " << col_matrix.dims(); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(0)); + auto filter_ptr = filter_slice.data(); + } + } +} + 
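+// A worked shape sketch for the loop above (illustrative only: it assumes
+// the 1x32x150x150 input used by the unit test, a 32x1x3x3 depthwise
+// filter, groups == 32, stride 1 and pad 1):
+//   output H/W: (150 + 2 * 1 - 3) / 1 + 1 = 150;
+//   in_step == out_step == 32 / 32 == 1, so each group sees one channel;
+//   per group, im2col yields a (1*3*3) x (150*150) col_matrix, and the
+//   gemm of the 1 x 9 filter_slice against it fills a 1 x 22500 out_slice.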
+template class DepthwiseConvKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/conv_kernel.h b/src/operators/kernel/conv_kernel.h index a756e2d2417..d43a174ffdb 100644 --- a/src/operators/kernel/conv_kernel.h +++ b/src/operators/kernel/conv_kernel.h @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "framework/operator.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" @@ -23,12 +24,28 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { -using namespace framework; +using framework::OpKernelBase; template -class ConvKernel : public framework::OpKernelBase { +class ConvKernel : public OpKernelBase { public: void Compute(const ConvParam ¶m) const; }; + +inline bool IsExpand(const std::vector &filter_dim, + const std::vector &strides, + const std::vector &paddings, + const std::vector &dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/depthwise_conv_kernel.h b/src/operators/kernel/depthwise_conv_kernel.h new file mode 100644 index 00000000000..43ddfb25cd8 --- /dev/null +++ b/src/operators/kernel/depthwise_conv_kernel.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "framework/operator.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +#pragma once; + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class DepthwiseConvKernel : public OpKernelBase { + public: + void Compute(const ConvParam ¶m) const; +}; +} // namespace operators +} // namespace paddle_mobile diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f464c3bd94f..2bb313342e2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -91,3 +91,7 @@ target_link_libraries(test-googlenet paddle-mobile) # gen test ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) target_link_libraries(test-sigmoid paddle-mobile) + +# gen test +ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h) +target_link_libraries(test-depthwise-conv-op paddle-mobile) diff --git a/test/operators/test_depthwise_conv_op.cpp b/test/operators/test_depthwise_conv_op.cpp new file mode 100644 index 00000000000..648b4c5db99 --- /dev/null +++ b/test/operators/test_depthwise_conv_op.cpp @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../executor_for_test.h" +#include "../test_include.h" +#include "operators/depthwise_conv_op.h" + +int main() { + paddle_mobile::Loader loader; + // ../models/image_classification_resnet.inference.model + auto program = loader.Load(g_mobilenet_ssd); + + PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, + "program file read fail"); + + Executor4Test> + executor(program, "depthwise_conv2d"); + + paddle_mobile::framework::LoDTensor input; + // GetInput(g_test_image_1x3x224x224, &input, {1, 3, 224, 224}); + // use SetupTensor if not has local input image . 
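+  // SetupTensor (declared in test_helper.h) is assumed to fill the tensor
+  // with uniform random values in [0, 1), so this test exercises shapes and
+  // the kernel code path rather than checking numerical results.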
+ SetupTensor(&input, {1, 32, 150, 150}, static_cast(0), + static_cast(1)); + auto input_ptr = input.data(); + auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 150, 150}); + auto output = executor.Predict(input, "batch_norm_0.tmp_3", + "depthwise_conv2d_0.tmp_0", out_ddim); + + auto output_ptr = output->data(); + for (int j = 0; j < output->numel(); ++j) { + DLOG << " value of output: " << output_ptr[j]; + } + return 0; +} From 3e0e0705f517e2d4fc6a436d2fea65754d1d586b Mon Sep 17 00:00:00 2001 From: eclipsess Date: Wed, 30 May 2018 16:53:48 +0800 Subject: [PATCH 15/26] add test yolo and mobilenet --- src/framework/operator.h | 3 ++- test/CMakeLists.txt | 8 ++++++++ test/net/test_mobilenet.cpp | 39 +++++++++++++++++++++++++++++++++++ test/net/test_yolo.cpp | 41 +++++++++++++++++++++++++++++++++++++ test/test_helper.h | 1 + 5 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 test/net/test_mobilenet.cpp create mode 100644 test/net/test_yolo.cpp diff --git a/src/framework/operator.h b/src/framework/operator.h index a44d264a188..0d617617753 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -50,7 +50,8 @@ static std::unordered_map< {"lrn", {{"X"}, {"Out"}}}, {"concat", {{"X"}, {"Out"}}}, {"feed", {{"X"}, {"Out"}}}, - {"fetch", {{"X"}, {"Out"}}}}; + {"fetch", {{"X"}, {"Out"}}}, + {"reshape", {{"X"}, {"Out"}}}}; template class OperatorBase : PaddleMobileObject { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f464c3bd94f..c80d34c22e5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -84,10 +84,18 @@ target_link_libraries(test-gemm paddle-mobile) ADD_EXECUTABLE(test-enforce common/test_enforce.cpp) target_link_libraries(test-enforce paddle-mobile) +# gen test +ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) +target_link_libraries(test-yolo paddle-mobile) + # gen test ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-googlenet paddle-mobile) +# gen test +ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) +target_link_libraries(test-mobilenet paddle-mobile) + # gen test ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) target_link_libraries(test-sigmoid paddle-mobile) diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp new file mode 100644 index 00000000000..e686ad85be7 --- /dev/null +++ b/test/net/test_mobilenet.cpp @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::Loader loader; + auto time1 = time(); + auto program = loader.Load(g_mobilenet); + auto time2 = time(); + DLOG << "load cost :" << time_diff(time1, time1) << "ms"; + paddle_mobile::Executor executor(program, 1); + + std::vector dims{1, 3, 224, 224}; + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0), + static_cast(1)); + + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + auto time3 = time(); + executor.predict(input, dims); + auto time4 = time(); + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + return 0; +} diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp new file mode 100644 index 00000000000..ab61fb250e3 --- /dev/null +++ b/test/net/test_yolo.cpp @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::Loader loader; + // ../../../test/models/googlenet + // ../../../test/models/mobilenet + auto time1 = time(); + auto program = loader.Load(g_yolo); + auto time2 = time(); + DLOG << "load cost :" << time_diff(time1, time1) << "ms"; + paddle_mobile::Executor executor(program, 1); + + std::vector dims{1, 3, 227, 227}; + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 227, 227}, static_cast(0), + static_cast(1)); + + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + auto time3 = time(); + executor.predict(input, dims); + auto time4 = time(); + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + return 0; +} diff --git a/test/test_helper.h b/test/test_helper.h index dba4dec9bbc..fc4ed6c91dc 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -28,6 +28,7 @@ static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd"; static const std::string g_squeezenet = "../models/squeezenet"; static const std::string g_resnet = "../models/image_classification_resnet.inference.model"; +static const std::string g_yolo = "../models/yolo"; static const std::string g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; using paddle_mobile::framework::DDim; From 680868a9dbc1501bf95e0f430ae89841e2759de9 Mon Sep 17 00:00:00 2001 From: eclipsess Date: Wed, 30 May 2018 17:54:46 +0800 Subject: [PATCH 16/26] add test resnet squeezenet ssd(mobilenet) --- src/common/types.h | 32 ++++++++++++++----------- src/io.cpp | 2 +- test/CMakeLists.txt | 11 +++++++++ test/net/test_mobilenet+ssd.cpp | 39 +++++++++++++++++++++++++++++++ test/net/test_resnet.cpp | 39 +++++++++++++++++++++++++++++++ test/net/test_squeezenet.cpp | 41 +++++++++++++++++++++++++++++++++ 6 files changed, 150 insertions(+), 14 deletions(-) create mode 100644 test/net/test_mobilenet+ssd.cpp create mode 100644 test/net/test_resnet.cpp create mode 100644 test/net/test_squeezenet.cpp diff --git a/src/common/types.h 
b/src/common/types.h index 227151adbbd..ca9e64cc60f 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -95,17 +95,23 @@ static const std::string G_OP_TYPE_FETCH = "fetch"; static std::unordered_map< std::string, std::pair, std::vector>> - op_input_output_key = {{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}}, - {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, - {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, - {G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, - {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}}, - {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}}, - {G_OP_TYPE_LRN, {{"X"}, {"Out"}}}, - {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}}, - {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}}, - {G_OP_TYPE_FEED, {{"X"}, {"Out"}}}, - {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}, - {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}}; + op_input_output_key = { + {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}}, + {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, + {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, + {G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, + {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}}, + {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}}, + {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}}, + {G_OP_TYPE_LRN, {{"X"}, {"Out"}}}, + {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}}, + {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FEED, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}, + {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_BOX_CODER, + {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}}, + {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}}, + {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, + {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}}; } // namespace paddle_mobile diff --git a/src/io.cpp b/src/io.cpp index 002e73b7964..b8350a81118 100644 --- a/src/io.cpp +++ b/src/io.cpp @@ -220,7 +220,7 @@ const framework::Program Loader::Load( } } } - // originProgramDesc->Description("program: "); + originProgramDesc->Description("program: "); if (optimize) { framework::ProgramOptimize program_optimize; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 37c0de1496b..c71306281e3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -96,6 +96,17 @@ target_link_libraries(test-googlenet paddle-mobile) ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-mobilenet paddle-mobile) +# gen test +ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) +target_link_libraries(test-resnet paddle-mobile) +# gen test +ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) +target_link_libraries(test-mobilenetssd paddle-mobile) + +# gen test +ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) +target_link_libraries(test-squeezenet paddle-mobile) + # gen test ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) target_link_libraries(test-sigmoid paddle-mobile) diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp new file mode 100644 index 00000000000..e9d92e7a51b --- /dev/null +++ b/test/net/test_mobilenet+ssd.cpp @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::Loader loader; + auto time1 = time(); + auto program = loader.Load(g_mobilenet_ssd, false); + auto time2 = time(); + DLOG << "load cost :" << time_diff(time1, time1) << "ms"; + paddle_mobile::Executor executor(program, 1, false); + + std::vector dims{1, 3, 300, 300}; + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 300, 300}, static_cast(0), + static_cast(1)); + + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + auto time3 = time(); + executor.Predict(input, dims); + auto time4 = time(); + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + return 0; +} diff --git a/test/net/test_resnet.cpp b/test/net/test_resnet.cpp new file mode 100644 index 00000000000..55f4c5efef2 --- /dev/null +++ b/test/net/test_resnet.cpp @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::Loader loader; + auto time1 = time(); + auto program = loader.Load(g_resnet, false); + auto time2 = time(); + DLOG << "load cost :" << time_diff(time1, time1) << "ms"; + paddle_mobile::Executor executor(program, 1, false); + + std::vector dims{1, 3, 32, 32}; + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 32, 32}, static_cast(0), + static_cast(1)); + + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + auto time3 = time(); + executor.Predict(input, dims); + auto time4 = time(); + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + return 0; +} diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp new file mode 100644 index 00000000000..30460018fe8 --- /dev/null +++ b/test/net/test_squeezenet.cpp @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::Loader loader; + // ../../../test/models/googlenet + // ../../../test/models/mobilenet + auto time1 = time(); + auto program = loader.Load(g_squeezenet, false); + auto time2 = time(); + DLOG << "load cost :" << time_diff(time1, time1) << "ms"; + paddle_mobile::Executor executor(program, 1, false); + + std::vector dims{1, 3, 227, 227}; + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 227, 227}, static_cast(0), + static_cast(1)); + + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + auto time3 = time(); + executor.Predict(input, dims); + auto time4 = time(); + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + return 0; +} From e8cc4c9279124cef7d2ba6985356bc76d523543a Mon Sep 17 00:00:00 2001 From: wangliu Date: Wed, 30 May 2018 19:09:05 +0800 Subject: [PATCH 17/26] add impl for executor'predict --- CMakeLists.txt | 3 ++- scripts/push2android.sh | 14 ++++++++++++++ src/common/enforce.h | 6 +++--- src/framework/operator.cpp | 14 +------------- src/framework/operator.h | 12 ++++++++++++ src/io.cpp | 34 +++++++++++++++++++++++++--------- src/io.h | 10 ++++++---- test/net/test_googlenet.cpp | 7 ++----- 8 files changed, 65 insertions(+), 35 deletions(-) create mode 100644 scripts/push2android.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index a9382f9697e..f60846e98aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.0) project(paddle-mobile) -add_definitions(-DPADDLE_MOBILE_DEBUG="true") +add_definitions(-DPADDLE_MOBILE_DEBUG) +add_definitions(-DENABLE_EXCEPTION) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(CMAKE_BUILD_TYPE RelWithDebInfo) diff --git a/scripts/push2android.sh b/scripts/push2android.sh new file mode 100644 index 00000000000..44b0ee32e99 --- /dev/null +++ b/scripts/push2android.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env sh + +push_fn () { +MODELS_PATH="../test/models/*" +EXE_FILE="../test/build/*" +EXE_DIR="data/local/tmp/bin" +MODELS_DIR="data/local/tmp/models" +LIB_PATH="../build/release/arm-v7a/build/*" +adb push ${EXE_FILE} ${EXE_DIR} +adb push ${LIB_PATH} ${EXE_DIR} +adb push ${MODELS_PATH} ${MODELS_DIR} +echo "test files sync completed" +} +push_fn diff --git a/src/common/enforce.h b/src/common/enforce.h index abd6217fbeb..52bda2258a0 100644 --- a/src/common/enforce.h +++ b/src/common/enforce.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_MOBILE_DEBUG +#ifdef ENABLE_EXCEPTION #include #include #include @@ -25,7 +25,7 @@ limitations under the License. */ namespace paddle_mobile { -#ifdef PADDLE_MOBILE_DEBUG +#ifdef ENABLE_EXCEPTION struct PaddleMobileException : public std::exception { const std::string exception_prefix = "paddle mobile C++ Exception: \n"; std::string message; @@ -64,7 +64,7 @@ struct PaddleMobileException : public std::exception { } #else #define PADDLE_MOBILE_THROW_EXCEPTION(...) -#define PADDLE_MOBILE_ASSERT(stat, ...) +#define PADDLE_MOBILE_ENFORCE(stat, ...) 
#endif } // namespace paddle_mobile diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 808002d4c8f..46feb97cb87 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -28,18 +28,6 @@ vector OperatorBase::GetOutKeys() const { return it->second.second; } -template -static T *GetVarValue(const string &key, const VariableNameMap &var_map, - const Scope &scope) { - auto var_vec = var_map.at(key); - if (!var_vec.empty()) { - auto var = scope.FindVar(var_vec[0]); - return var->GetMutable(); - } else { - return nullptr; - } -} - template OperatorBase::OperatorBase(const std::string &type, const VariableNameMap &inputs, @@ -60,7 +48,7 @@ void OperatorBase::CheckAllInputOutputSet() const {} template void OperatorBase::Run() const { RunImpl(); -#ifdef PADDLE_MOBILE_DEBUG +#if (PADDLE_MOBILE_DEBUG) vector output_keys = GetOutKeys(); for (const auto key : output_keys) { Tensor *out_ = GetVarValue(key, outputs_, *scope_); diff --git a/src/framework/operator.h b/src/framework/operator.h index 6194e5dcfff..2de3a953670 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -39,6 +39,18 @@ namespace framework { using std::string; using std::vector; +template +static T *GetVarValue(const string &key, const VariableNameMap &var_map, + const Scope &scope) { + auto var_vec = var_map.at(key); + if (!var_vec.empty()) { + auto var = scope.FindVar(var_vec[0]); + return var->GetMutable(); + } else { + return nullptr; + } +} + template class OperatorBase : PaddleMobileObject { public: diff --git a/src/io.cpp b/src/io.cpp index b8350a81118..ac89106e498 100644 --- a/src/io.cpp +++ b/src/io.cpp @@ -371,31 +371,47 @@ void Executor::InitMemory() { } template -void Executor::Predict(const framework::Tensor &t, int block_id) { +std::shared_ptr Executor::Predict( + const framework::Tensor &t) { framework::Variable *g_feed_value = program_.scope->Var("feed"); framework::Tensor *feed_tensor = g_feed_value->GetMutable(); feed_tensor->Resize(t.dims()); feed_tensor->ShareDataWith(t); std::shared_ptr to_predict_block = - to_predict_program_->Block(block_id); + to_predict_program_->Block(0); for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { auto op = ops_of_block_[*to_predict_block.get()][j]; op->Run(); } + auto ops = ops_of_block_[*to_predict_program_->Block(0)]; + auto last_op = ops.rbegin(); + auto output_map = (*last_op)->Outputs(); + std::vector out_keys = (*last_op)->GetOutKeys(); + PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output"); + framework::LoDTensor *output_tensor = + framework::GetVarValue(out_keys[0], output_map, + *(program_.scope)); + return std::shared_ptr(output_tensor); +} +template +std::shared_ptr Executor::Predict( + const framework::Tensor &t, int block_id) { + return Predict(t); } template std::vector::Ptype> Executor::Predict( const std::vector &input, const std::vector &dims) { framework::Tensor tensor(input, framework::make_ddim(dims)); - - Predict(tensor, 0); - - framework::Variable *g_feed_value = program_.scope->Var("col"); - auto feed_tensor = g_feed_value->GetMutable(); - - return {}; + std::shared_ptr output_tensor = Predict(tensor, 0); + Executor::Ptype *output_ptr = + output_tensor->data::Ptype>(); + std::vector::Ptype> result_vector; + for (int j = 0; j < output_tensor->numel(); ++j) { + result_vector.push_back(output_ptr[j]); + } + return result_vector; } template class Executor; diff --git a/src/io.h b/src/io.h index de2d359bf58..ae99197baa9 100644 --- a/src/io.h +++ b/src/io.h @@ 
-15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include @@ -44,24 +45,25 @@ class Executor { public: typedef typename PrecisionTrait
<P>
::ptype Ptype; - Executor() = default; - Executor(const framework::Program p, int batch_size = 1, bool use_optimize = true); - // std::shared_ptr Predict(framework::Tensor &t); + std::shared_ptr Predict(const framework::Tensor &t); std::vector Predict(const std::vector &input, const std::vector &dims); protected: + Executor() = default; + void InitMemory(); void LoadMemory(const framework::VarDesc var_desc, framework::LoDTensor *tensor, const std::string &file_path); framework::Program program_; int batch_size_ = 1; std::shared_ptr to_predict_program_; - void Predict(const framework::Tensor &t, int block_id); + std::shared_ptr Predict(const framework::Tensor &t, + int block_id); std::map>>> ops_of_block_; diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index 139579e9116..0640af890cf 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -18,20 +18,17 @@ limitations under the License. */ int main() { paddle_mobile::Loader loader; - // ../../../test/models/googlenet - // ../../../test/models/mobilenet auto time1 = time(); auto program = loader.Load(g_googlenet, false); auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; + DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; paddle_mobile::Executor executor(program, 1, false); - std::vector input; std::vector dims{1, 3, 224, 224}; GetInput(g_test_image_1x3x224x224, &input, dims); auto time3 = time(); executor.Predict(input, dims); auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n"; return 0; } From e64e51afbee01204c3ef97fec6a723651479cde8 Mon Sep 17 00:00:00 2001 From: wangliu Date: Thu, 31 May 2018 10:13:24 +0800 Subject: [PATCH 18/26] modify softmax to support input with multi batch --- src/operators/math/softmax.cpp | 8 +++++++- test/net/test_mobilenet.cpp | 13 +++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp index 6eaeb6e2561..224382eb2b7 100644 --- a/src/operators/math/softmax.cpp +++ b/src/operators/math/softmax.cpp @@ -136,9 +136,15 @@ class SoftmaxFuntor { public: void operator()(const framework::Tensor *X, framework::Tensor *Y) { + const DDim dDim = X->dims(); + for (int i = 0; i < dDim[0]; ++i) { + framework::Tensor sub_X = X->Slice(i, i + 1); + framework::Tensor sub_Y = Y->Slice(i, i + 1); + #if __ARM_NEON - SoftmaxCacl(X, Y); + SoftmaxCacl(&sub_X, &sub_Y); #endif + } } }; diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index b5d925227e4..aee0d456813 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -22,17 +22,22 @@ int main() { auto program = loader.Load(g_mobilenet, false); auto time2 = time(); DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor executor(program, 1, false); + paddle_mobile::Executor executor(program, 2, false); - std::vector dims{1, 3, 224, 224}; + std::vector dims{2, 3, 224, 224}; Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0), + SetupTensor(&input_tensor, {2, 3, 224, 224}, static_cast(0), static_cast(1)); std::vector input(input_tensor.data(), input_tensor.data() + input_tensor.numel()); auto time3 = time(); - executor.Predict(input, dims); + auto vec_result = executor.Predict(input, dims); + float sum = 0; + for (const auto item : vec_result) { + sum += item; + } + DLOG << "mobilenet output sum =" << sum; auto time4 = time(); 
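   // Predict(input, dims) now returns the flattened output of the program's
   // last op (see the Executor::Predict changes in src/io.cpp above), so
   // summing it is a cheap smoke check that non-trivial data came back.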
DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; return 0; From c1a55d8c6d4f3811271f50ff5b875406aa754f48 Mon Sep 17 00:00:00 2001 From: wangliu Date: Thu, 31 May 2018 10:25:04 +0800 Subject: [PATCH 19/26] modify code style --- test/net/test_mobilenet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index aee0d456813..7ed9a3566e3 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -33,7 +33,7 @@ int main() { input_tensor.data() + input_tensor.numel()); auto time3 = time(); auto vec_result = executor.Predict(input, dims); - float sum = 0; + float sum = 0; for (const auto item : vec_result) { sum += item; } From 6ea5e26ebdf816961973979bdd0b480c4dc52be2 Mon Sep 17 00:00:00 2001 From: eclipsess Date: Thu, 31 May 2018 10:46:43 +0800 Subject: [PATCH 20/26] remove some annotations --- .../kernel/arm/depthwise_conv_kernel.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index 73aa9953cfc..ff622d13340 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -32,7 +32,7 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { std::vector paddings = param.Paddings(); std::vector dilations = param.Dilations(); - DLOG << " compute end get Attrs " << strides[0]; +// DLOG << " compute end get Attrs " << strides[0]; const int batch_size = static_cast(input->dims()[0]); @@ -59,17 +59,17 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } - DLOG << " col_shape = " << col_shape; - DLOG << " col_matrix_shape = " << col_matrix_shape; +// DLOG << " col_shape = " << col_shape; +// DLOG << " col_matrix_shape = " << col_matrix_shape; framework::DDim input_shape = framework::slice_ddim( input->dims(), 1, static_cast(input->dims().size())); - DLOG << " input_shape = " << input_shape; +// DLOG << " input_shape = " << input_shape; framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]}; filter.Resize(filter_matrix_shape); - DLOG << " filter.dims() = " << filter.dims(); +// DLOG << " filter.dims() = " << filter.dims(); framework::DDim output_matrix_shape = { output->dims()[1], @@ -85,8 +85,8 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - DLOG << " in_batch.dims() = " << in_batch.dims(); - DLOG << " out_batch.dims() = " << out_batch.dims(); +// DLOG << " in_batch.dims() = " << in_batch.dims(); +// DLOG << " out_batch.dims() = " << out_batch.dims(); for (int g = 0; g < groups; g++) { Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); @@ -109,9 +109,9 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - DLOG << " out_slice " << out_slice.dims(); - DLOG << " filter_slice " << filter_slice.dims(); - DLOG << " col_matrix " << col_matrix.dims(); +// DLOG << " out_slice " << out_slice.dims(); +// DLOG << " filter_slice " << filter_slice.dims(); +// DLOG << " col_matrix " << col_matrix.dims(); 
math::matmul(filter_slice, false, col_matrix, false, static_cast(1), &out_slice, static_cast(0)); From 251dea7f2768a0bcfeac9fdd163f78fdbcd0f6be Mon Sep 17 00:00:00 2001 From: eclipsess Date: Thu, 31 May 2018 10:50:55 +0800 Subject: [PATCH 21/26] code style --- .../kernel/arm/depthwise_conv_kernel.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index ff622d13340..1da52fa8d46 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -32,7 +32,7 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { std::vector paddings = param.Paddings(); std::vector dilations = param.Dilations(); -// DLOG << " compute end get Attrs " << strides[0]; + // DLOG << " compute end get Attrs " << strides[0]; const int batch_size = static_cast(input->dims()[0]); @@ -59,17 +59,17 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } -// DLOG << " col_shape = " << col_shape; -// DLOG << " col_matrix_shape = " << col_matrix_shape; + // DLOG << " col_shape = " << col_shape; + // DLOG << " col_matrix_shape = " << col_matrix_shape; framework::DDim input_shape = framework::slice_ddim( input->dims(), 1, static_cast(input->dims().size())); -// DLOG << " input_shape = " << input_shape; + // DLOG << " input_shape = " << input_shape; framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]}; filter.Resize(filter_matrix_shape); -// DLOG << " filter.dims() = " << filter.dims(); + // DLOG << " filter.dims() = " << filter.dims(); framework::DDim output_matrix_shape = { output->dims()[1], @@ -85,8 +85,8 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); -// DLOG << " in_batch.dims() = " << in_batch.dims(); -// DLOG << " out_batch.dims() = " << out_batch.dims(); + // DLOG << " in_batch.dims() = " << in_batch.dims(); + // DLOG << " out_batch.dims() = " << out_batch.dims(); for (int g = 0; g < groups; g++) { Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); @@ -109,9 +109,9 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); -// DLOG << " out_slice " << out_slice.dims(); -// DLOG << " filter_slice " << filter_slice.dims(); -// DLOG << " col_matrix " << col_matrix.dims(); + // DLOG << " out_slice " << out_slice.dims(); + // DLOG << " filter_slice " << filter_slice.dims(); + // DLOG << " col_matrix " << col_matrix.dims(); math::matmul(filter_slice, false, col_matrix, false, static_cast(1), &out_slice, static_cast(0)); From b07f7b2d66bcdf6fbcba7fe5331372f9e7ec3891 Mon Sep 17 00:00:00 2001 From: liuruilong Date: Thu, 31 May 2018 12:01:47 +0800 Subject: [PATCH 22/26] add split config --- src/framework/operator.h | 2 +- src/framework/program/block_desc.cpp | 6 +- src/framework/program/block_desc.h | 4 + .../program/program-optimize/node.cpp | 108 ++++++------- src/framework/program/program-optimize/node.h | 5 +- .../program-optimize/program_optimize.cpp | 153 +++++++++++++++++- .../program-optimize/program_optimize.h | 16 +- 
src/framework/program/program_desc.cpp | 10 +- 8 files changed, 227 insertions(+), 77 deletions(-) diff --git a/src/framework/operator.h b/src/framework/operator.h index 8e5e55fb469..6e5a2b089db 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -138,7 +138,7 @@ class FusionOpMatcher : PaddleMobileObject { virtual Node &BeginNode() { return node_; } - std::string BeginType() { return node_.BeginType(); } + std::string BeginType() { return node_.Type(); } protected: Node node_; diff --git a/src/framework/program/block_desc.cpp b/src/framework/program/block_desc.cpp index 7342abe2844..0ddb9126192 100644 --- a/src/framework/program/block_desc.cpp +++ b/src/framework/program/block_desc.cpp @@ -26,11 +26,7 @@ std::vector> BlockDesc::Vars() const { } std::vector> BlockDesc::Ops() const { - std::vector> res; - for (const auto &op : ops_) { - res.push_back(op); - } - return res; + return ops_; } BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc) diff --git a/src/framework/program/block_desc.h b/src/framework/program/block_desc.h index 1a22714b52a..84d7a90fc11 100644 --- a/src/framework/program/block_desc.h +++ b/src/framework/program/block_desc.h @@ -26,6 +26,7 @@ class BlockDesc : PaddleMobileObject { public: friend class Node; friend class ProgramOptimize; + BlockDesc() {} BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc); BlockDesc(const BlockDesc &block_desc) : index_(block_desc.index_), parent_index_(block_desc.parent_index_) { @@ -43,6 +44,8 @@ class BlockDesc : PaddleMobileObject { const int &ID() const { return index_; } + const bool &MultiThread() const { return multi_thread_; } + const int &Parent() const { return parent_index_; } bool operator==(const paddle_mobile::framework::BlockDesc &in_block) const { @@ -58,6 +61,7 @@ class BlockDesc : PaddleMobileObject { private: int index_; + bool multi_thread_; int parent_index_; std::vector> ops_; std::unordered_map> vars_; diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp index 820fa6a443c..5edde24c598 100644 --- a/src/framework/program/program-optimize/node.cpp +++ b/src/framework/program/program-optimize/node.cpp @@ -45,17 +45,6 @@ bool Node::operator==(const Node &in) { return true; } -// std::shared_ptr Node::MatchTheFirstNode(std::string type){ -// -// for (const auto &node : outputs_){ -// if (node->type_ == type){ -// return node; -// }else{ -// -// } -// } -//} - std::vector> Node::OpDescs(uint size) { std::vector> op_descs; OpDescs(size - 1, &op_descs); @@ -75,21 +64,40 @@ void Node::OpDescs(uint index, void Node::OpDescs(std::vector> *op_desc, Node *node, bool adding_thread, int thread_num) { - bool can_add_split = false; if (outputs_.size() > 1) { + adding_thread = false; + } + + bool can_add_split = false; + // 如果当前节点有多个输出 并且 只有当前节点对应的 op_desc_ 输出数为 1 时支持 + if (outputs_.size() > 1 && + op_input_output_key[op_desc_->type_].second.size() == 1) { can_add_split = true; - if (op_input_output_key[op_desc_->type_].second.size() != 1) { - DLOG << "当前 op desc 输出数不为 1 "; - can_add_split = false; - } + + // 遍历当前节点的 output 节点 for (const auto &output : outputs_) { - if (op_input_output_key.find(output->op_desc_->type_) != - op_input_output_key.end()) { - auto inputs_and_outputs = op_input_output_key[output->op_desc_->type_]; - auto outputs_of_output = - output->op_desc_->Output(inputs_and_outputs.second[0]); - auto inputs_of_output = - output->op_desc_->Input(inputs_and_outputs.first[0]); + // 不支持 output 有多个 output 的情况 + if 
(output->outputs_.size() > 0) { + can_add_split = false; + break; + } + + //与节点关联的 OpDesc + std::shared_ptr &op_desc = output->op_desc_; + + //获取这个 op 的 inputs key 和 outputs key + auto inputs_and_outputs = op_input_output_key[op_desc->type_]; + + //判断现在 是否存在这个 op + //判断这个 output 和 input key 的 size 等于 1 + if (op_input_output_key.find(op_desc->type_) != + op_input_output_key.end() && + inputs_and_outputs.first.size() == 1 && + inputs_and_outputs.second.size() == 1) { + auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]); + auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]); + + // 判断一下, 如果输入和输出没有同名, 是支持的 for (int i = 0; i < inputs_of_output.size(); ++i) { std::string input_of_output = inputs_of_output[i]; for (int j = 0; j < outputs_of_output.size(); ++j) { @@ -101,7 +109,7 @@ void Node::OpDescs(std::vector> *op_desc, } } } - } else { + } else { // 如果模型中包含没有的 op, 则不支持添加 split DLOG << "找不到 这个 op 类型: " << output->op_desc_->type_; can_add_split = false; } @@ -124,12 +132,11 @@ void Node::OpDescs(std::vector> *op_desc, if (can_add_split) { adding_thread = true; - std::shared_ptr split_op_desc = - std::make_shared(); + std::shared_ptr split_op_desc = + std::make_shared(); split_op_desc->type_ = G_OP_TYPE_SPLIT; auto outputs = this->op_desc_->Output( op_input_output_key[this->op_desc_->Type()].second[0]); - split_op_desc->inputs_ = { {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}}; auto &split_outputs = @@ -157,41 +164,12 @@ std::vector> Node::OpDescs() { return op_descs; } -std::string Node::ToString(std::string blank, const Node *node) const { - std::stringstream ss; - ss << type_ << "-> \n"; - - if (inputs_.size() > 1 && node != inputs_.back()) { - return ss.str(); - } else if (inputs_.size() > 1 && node == inputs_.back()) { - ss << "\n" << blank << type_ << "\n"; - } - - for (int i = 0; i < outputs_.size(); ++i) { - ss << blank << outputs_[i]->ToString(blank + " ", this) << ""; - } - return ss.str(); -} - -std::string Node::ToString() const { return this->ToString(" ", this); } - std::shared_ptr Node::To(int size) { std::shared_ptr node = std::make_shared(); this->To(size - 1, node); return node; } -// Node &Node::To(int size) { -// if (size == 1) { -// this->outputs_.clear(); -// } -// -// for (int j = 0; j < this->outputs_.size(); ++j) { -// outputs_[j]->To(size - 1); -// } -// return *this; -//} - void Node::To(int index, std::shared_ptr node) { node->type_ = this->type_; if (index != 0) { @@ -268,6 +246,24 @@ void Node::Folder( } } +std::string Node::ToString(std::string blank, const Node *node) const { + std::stringstream ss; + ss << type_ << "-> \n"; + + if (inputs_.size() > 1 && node != inputs_.back()) { + return ss.str(); + } else if (inputs_.size() > 1 && node == inputs_.back()) { + ss << "\n" << blank << type_ << "\n"; + } + + for (int i = 0; i < outputs_.size(); ++i) { + ss << blank << outputs_[i]->ToString(blank + " ", this) << ""; + } + return ss.str(); +} + +std::string Node::ToString() const { return this->ToString(" ", this); } + void Node::Description() { if (op_desc_.get()) { DLOG << *op_desc_; diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h index 5dd1a3acbf5..b7fe9b1f07a 100644 --- a/src/framework/program/program-optimize/node.h +++ b/src/framework/program/program-optimize/node.h @@ -27,6 +27,7 @@ namespace paddle_mobile { namespace framework { class Node : PaddleMobileObject { + friend class ProgramOptimize; public: Node() {} explicit Node(const std::string &type) : 
type_(type) {} @@ -42,8 +43,8 @@ class Node : PaddleMobileObject { std::map> change_map); std::vector> OpDescs(uint size); std::vector> OpDescs(); - std::shared_ptr OpDesc() { return op_desc_; } - std::string BeginType() { return type_; } + std::shared_ptr OpDescOfNode() { return op_desc_; } + std::string Type() { return type_; } void Description(); private: diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp index 737fed9bd56..4c757bac755 100644 --- a/src/framework/program/program-optimize/program_optimize.cpp +++ b/src/framework/program/program-optimize/program_optimize.cpp @@ -19,11 +19,12 @@ namespace paddle_mobile { namespace framework { -// std::shared_ptr ProgramOptimize::Optimize() {} - std::shared_ptr ProgramOptimize::FushionOptimize( - std::shared_ptr ori_des) { - ProgramDesc *optimize_program = new ProgramDesc(*ori_des); + std::shared_ptr ori_des, bool add_split) { + +// ProgramDesc *optimize_program = new ProgramDesc(*ori_des); + std::shared_ptr optimize_program = std::make_shared(*ori_des); + current_block_ = optimize_program->Blocks().size(); for (int i = 0; i < optimize_program->Blocks().size(); ++i) { std::unordered_map> output_nodes; @@ -96,10 +97,148 @@ std::shared_ptr ProgramOptimize::FushionOptimize( } // DLOG << "node: \n" << *begin_node; - block->ops_ = begin_node->OpDescs(); + + + std::vector> op_descs; + GenerateOps(&op_descs, begin_node.get()); + block->ops_ = op_descs; + } + + for (int m = 0; m < new_blocks_.size(); ++m) { + std::shared_ptr new_block = new_blocks_[m]; + new_block->index_ = m + ori_des->blocks_.size(); + optimize_program->blocks_.push_back(new_block); } - std::shared_ptr shared_optimzie(optimize_program); - return shared_optimzie; + return optimize_program; } + + +void ProgramOptimize::GenerateOps(std::vector> *op_desc, + Node *input_node, + Node *current_node, + bool adding_thread, + int thread_num, + std::shared_ptr new_block) { + if (current_node->outputs_.size() > 1) { + adding_thread = false; + } + + bool can_add_split = false; + // 如果当前节点有多个输出 并且 只有当前节点对应的 op_desc_ 输出数为 1 时支持 + if (current_node->outputs_.size() > 1 && + op_input_output_key[current_node->op_desc_->type_].second.size() == 1) { + can_add_split = true; + + // 遍历当前节点的 output 节点 + for (const auto &output : current_node->outputs_) { + // 不支持 output 有多个 output 的情况 + if (output->outputs_.size() > 1) { + DLOG << "don't support multi output of output"; + can_add_split = false; + break; + } + + //与节点关联的 OpDesc + std::shared_ptr &op_desc = output->op_desc_; + + //获取这个 op 的 inputs key 和 outputs key + auto inputs_and_outputs = op_input_output_key[op_desc->type_]; + + //判断现在 是否存在这个 op + //判断这个 output 和 input key 的 size 等于 1 + if (op_input_output_key.find(op_desc->type_) != + op_input_output_key.end() && + inputs_and_outputs.first.size() == 1 && + inputs_and_outputs.second.size() == 1) { + auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]); + auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]); + + // 判断一下, 如果输入和输出没有同名, 是支持的 + for (int i = 0; i < inputs_of_output.size(); ++i) { + std::string input_of_output = inputs_of_output[i]; + for (int j = 0; j < outputs_of_output.size(); ++j) { + std::string output_of_output = outputs_of_output[j]; + if (input_of_output == output_of_output) { + DLOG << "output的 output 包含 input" << input_of_output; + can_add_split = false; + break; + } + } + } + } else { // 如果模型中包含没有的 op, 则不支持添加 split + DLOG << "找不到 这个 op 类型: " << 
output->op_desc_->type_; + can_add_split = false; + } + } + } + + if (current_node->inputs_.size() > 1 && input_node != current_node->inputs_.back()) { + return; + } else if (current_node->inputs_.size() > 1 && input_node == current_node->inputs_.back()) { + new_block.reset(); + adding_thread = false; + op_desc->push_back(current_node->op_desc_); + } else { + if (new_block.get() && adding_thread) { + new_block->ops_.push_back(current_node->op_desc_); + } else { + op_desc->push_back(current_node->op_desc_); + } + } + if (adding_thread) { + Attribute attr; + attr.Set(thread_num); + current_node->op_desc_->attrs_["thread"] = attr; + } + + + + if (can_add_split) { + new_block = std::make_shared(); + new_block->multi_thread_ = true; + new_block->index_ = current_block_; + new_blocks_.push_back(new_block); + + adding_thread = true; + std::shared_ptr split_op_desc = + std::make_shared(); + split_op_desc->type_ = G_OP_TYPE_SPLIT; + auto outputs = current_node->op_desc_->Output( + op_input_output_key[current_node->op_desc_->Type()].second[0]); + split_op_desc->inputs_ = { + {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}}; + auto &split_outputs = + split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]]; + for (const auto &output : current_node->outputs_) { + split_outputs.push_back(outputs[0]); + } + + Attribute attr; + attr.Set(current_block_); + split_op_desc->attrs_["block_id"] = attr; + + op_desc->push_back(split_op_desc); + current_block_++; + } + + for (int i = 0; i < current_node->outputs_.size(); ++i) { + auto &output = current_node->outputs_[i]; + if (can_add_split) { + GenerateOps(op_desc, current_node, output.get(), adding_thread, i, new_block); + } else { + GenerateOps(op_desc, current_node, output.get(), adding_thread, thread_num, new_block); + } + } +} + +void ProgramOptimize::GenerateOps(std::vector> *op_descs, + Node *begin_node) { + + + //std::vector> *op_desc, + // Node *input_node, Node *current_node, bool adding_thread, int thread_num + this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr); +} + } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/program/program-optimize/program_optimize.h b/src/framework/program/program-optimize/program_optimize.h index 3839fa1e36b..8ba8d2973fe 100644 --- a/src/framework/program/program-optimize/program_optimize.h +++ b/src/framework/program/program-optimize/program_optimize.h @@ -28,12 +28,20 @@ class ProgramOptimize { public: ProgramOptimize() {} std::shared_ptr FushionOptimize( - std::shared_ptr ori_des); + std::shared_ptr ori_des, bool add_split = false); private: - // std::shared_ptr ori_desc_; - std::vector>> - outputs_nodes_; + int current_block_; + std::vector> new_blocks_; + + void GenerateOps(std::vector> *op_descs, + Node *begin_node); + void GenerateOps(std::vector> *op_desc, + Node *input_node, + Node *current_node, + bool adding_thread, + int thread_num, + std::shared_ptr new_block); }; } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/program/program_desc.cpp b/src/framework/program/program_desc.cpp index 071f5cf5719..31f4bcb6f11 100644 --- a/src/framework/program/program_desc.cpp +++ b/src/framework/program/program_desc.cpp @@ -32,11 +32,13 @@ void ProgramDesc::Description(std::string header) { if (header.size()) { LOG(kLOG_INFO) << header; } - for (const auto &block : this->blocks_) { + + for (int i = 0; i < this->blocks_.size(); ++i) { + auto block = this->blocks_[i]; LOG(kLOG_DEBUG) << "block: " << block->ID(); 
diff --git a/src/framework/program/program_desc.cpp b/src/framework/program/program_desc.cpp
index 071f5cf5719..31f4bcb6f11 100644
--- a/src/framework/program/program_desc.cpp
+++ b/src/framework/program/program_desc.cpp
@@ -32,11 +32,13 @@ void ProgramDesc::Description(std::string header) {
   if (header.size()) {
     LOG(kLOG_INFO) << header;
   }
-  for (const auto &block : this->blocks_) {
+
+  for (int i = 0; i < this->blocks_.size(); ++i) {
+    auto block = this->blocks_[i];
     LOG(kLOG_DEBUG) << "block: " << block->ID();
     LOG(kLOG_INFO) << "block ops size: " << block->Ops().size();
     for (int j = 0; j < block->Ops().size(); ++j) {
-      const auto &op = block->Ops()[j];
+      auto op = block->Ops()[j];
       LOG(kLOG_DEBUG1) << "op: " << op->Type();
       for (auto &input : op->GetInputs()) {
         LOG(kLOG_DEBUG2) << "input parameter: " << input.first;
@@ -71,6 +73,10 @@ void ProgramDesc::Description(std::string header) {
       }
     }
   }
+
+  for (const auto &block : this->blocks_) {
+
+  }
 #endif
 }
 

From deef88cba43c397d20aa280b51420e3fbf9d2590 Mon Sep 17 00:00:00 2001
From: liuruilong
Date: Thu, 31 May 2018 12:02:26 +0800
Subject: [PATCH 23/26] format files

---
 src/framework/program/block_desc.cpp           |  4 +-
 .../program/program-optimize/node.cpp          |  3 +-
 src/framework/program/program-optimize/node.h  |  1 +
 .../program-optimize/program_optimize.cpp      | 55 +++++++++----------
 .../program-optimize/program_optimize.h        |  7 +--
 src/framework/program/program_desc.cpp         |  1 -
 6 files changed, 31 insertions(+), 40 deletions(-)

diff --git a/src/framework/program/block_desc.cpp b/src/framework/program/block_desc.cpp
index 0ddb9126192..21322f08256 100644
--- a/src/framework/program/block_desc.cpp
+++ b/src/framework/program/block_desc.cpp
@@ -25,9 +25,7 @@ std::vector<std::shared_ptr<VarDesc>> BlockDesc::Vars() const {
   return res;
 }
 
-std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const {
-  return ops_;
-}
+std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const { return ops_; }
 
 BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc)
     : index_(desc->idx), parent_index_(desc->idx) {
diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp
index 5edde24c598..31377222db8 100644
--- a/src/framework/program/program-optimize/node.cpp
+++ b/src/framework/program/program-optimize/node.cpp
@@ -132,8 +132,7 @@ void Node::OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
 
   if (can_add_split) {
     adding_thread = true;
-    std::shared_ptr<OpDesc> split_op_desc =
-        std::make_shared<OpDesc>();
+    std::shared_ptr<OpDesc> split_op_desc = std::make_shared<OpDesc>();
     split_op_desc->type_ = G_OP_TYPE_SPLIT;
     auto outputs = this->op_desc_->Output(
         op_input_output_key[this->op_desc_->Type()].second[0]);
diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h
index b7fe9b1f07a..da7e26a9ac0 100644
--- a/src/framework/program/program-optimize/node.h
+++ b/src/framework/program/program-optimize/node.h
@@ -28,6 +28,7 @@ namespace framework {
 
 class Node : PaddleMobileObject {
   friend class ProgramOptimize;
+
  public:
   Node() {}
   explicit Node(const std::string &type) : type_(type) {}
diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp
index 4c757bac755..8b0bf295262 100644
--- a/src/framework/program/program-optimize/program_optimize.cpp
+++ b/src/framework/program/program-optimize/program_optimize.cpp
@@ -21,9 +21,9 @@ namespace framework {
 
 std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
     std::shared_ptr<ProgramDesc> ori_des, bool add_split) {
-
-// ProgramDesc *optimize_program = new ProgramDesc(*ori_des);
-  std::shared_ptr<ProgramDesc> optimize_program = std::make_shared<ProgramDesc>(*ori_des);
+  // ProgramDesc *optimize_program = new ProgramDesc(*ori_des);
+  std::shared_ptr<ProgramDesc> optimize_program =
+      std::make_shared<ProgramDesc>(*ori_des);
   current_block_ = optimize_program->Blocks().size();
 
   for (int i = 0; i < optimize_program->Blocks().size(); ++i) {
     std::unordered_map<std::string, std::shared_ptr<Node>> output_nodes;
@@ -98,7 +98,6 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
 
     // DLOG << "node: \n" << *begin_node;
 
-
     std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
     GenerateOps(&op_descs, begin_node.get());
     block->ops_ = op_descs;
   }
@@ -112,13 +111,10 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
   return optimize_program;
 }
 
-
-void ProgramOptimize::GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
-                                  Node *input_node,
-                                  Node *current_node,
-                                  bool adding_thread,
-                                  int thread_num,
-                                  std::shared_ptr<BlockDesc> new_block) {
+void ProgramOptimize::GenerateOps(
+    std::vector<std::shared_ptr<framework::OpDesc>> *op_desc, Node *input_node,
+    Node *current_node, bool adding_thread, int thread_num,
+    std::shared_ptr<BlockDesc> new_block) {
   if (current_node->outputs_.size() > 1) {
     adding_thread = false;
   }
@@ -147,7 +143,7 @@ void ProgramOptimize::GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>
       // check whether this op type is known,
      // and that its output and input key lists both have size 1
       if (op_input_output_key.find(op_desc->type_) !=
-          op_input_output_key.end() &&
+              op_input_output_key.end() &&
           inputs_and_outputs.first.size() == 1 &&
           inputs_and_outputs.second.size() == 1) {
         auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]);
@@ -172,9 +168,11 @@ void ProgramOptimize::GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>
     }
   }
 
-  if (current_node->inputs_.size() > 1 && input_node != current_node->inputs_.back()) {
+  if (current_node->inputs_.size() > 1 &&
+      input_node != current_node->inputs_.back()) {
     return;
-  } else if (current_node->inputs_.size() > 1 && input_node == current_node->inputs_.back()) {
+  } else if (current_node->inputs_.size() > 1 &&
+             input_node == current_node->inputs_.back()) {
     new_block.reset();
     adding_thread = false;
     op_desc->push_back(current_node->op_desc_);
@@ -191,8 +189,6 @@ void ProgramOptimize::GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>
     current_node->op_desc_->attrs_["thread"] = attr;
   }
 
-
-
   if (can_add_split) {
     new_block = std::make_shared<BlockDesc>();
     new_block->multi_thread_ = true;
@@ -200,15 +196,14 @@ void ProgramOptimize::GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>
     new_blocks_.push_back(new_block);
 
     adding_thread = true;
-    std::shared_ptr<OpDesc> split_op_desc =
-        std::make_shared<OpDesc>();
+    std::shared_ptr<OpDesc> split_op_desc = std::make_shared<OpDesc>();
     split_op_desc->type_ = G_OP_TYPE_SPLIT;
     auto outputs = current_node->op_desc_->Output(
-          op_input_output_key[current_node->op_desc_->Type()].second[0]);
+        op_input_output_key[current_node->op_desc_->Type()].second[0]);
     split_op_desc->inputs_ = {
-          {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
+        {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
     auto &split_outputs =
-          split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]];
+        split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]];
     for (const auto &output : current_node->outputs_) {
       split_outputs.push_back(outputs[0]);
     }
@@ -224,19 +219,21 @@ void ProgramOptimize::GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>
   for (int i = 0; i < current_node->outputs_.size(); ++i) {
     auto &output = current_node->outputs_[i];
     if (can_add_split) {
-      GenerateOps(op_desc, current_node, output.get(), adding_thread, i, new_block);
+      GenerateOps(op_desc, current_node, output.get(), adding_thread, i,
+                  new_block);
     } else {
-      GenerateOps(op_desc, current_node, output.get(), adding_thread, thread_num, new_block);
+      GenerateOps(op_desc, current_node, output.get(), adding_thread,
+                  thread_num, new_block);
     }
   }
 }
 
-void ProgramOptimize::GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
-                                  Node *begin_node) {
-
-
-  //std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
-  // Node *input_node, Node *current_node, bool adding_thread, int thread_num
+void ProgramOptimize::GenerateOps(
+    std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
+    Node *begin_node) {
+  // std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
+  // Node *input_node, Node *current_node, bool adding_thread, int
+  // thread_num
   this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr);
 }
diff --git a/src/framework/program/program-optimize/program_optimize.h b/src/framework/program/program-optimize/program_optimize.h
index 8ba8d2973fe..32d8d1fa914 100644
--- a/src/framework/program/program-optimize/program_optimize.h
+++ b/src/framework/program/program-optimize/program_optimize.h
@@ -37,11 +37,8 @@ class ProgramOptimize {
   void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
                    Node *begin_node);
   void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
-                   Node *input_node,
-                   Node *current_node,
-                   bool adding_thread,
-                   int thread_num,
-                   std::shared_ptr<BlockDesc> new_block);
+                   Node *input_node, Node *current_node, bool adding_thread,
+                   int thread_num, std::shared_ptr<BlockDesc> new_block);
 };
 }  // namespace framework
 }  // namespace paddle_mobile
diff --git a/src/framework/program/program_desc.cpp b/src/framework/program/program_desc.cpp
index 31f4bcb6f11..8483e1e5d68 100644
--- a/src/framework/program/program_desc.cpp
+++ b/src/framework/program/program_desc.cpp
@@ -75,7 +75,6 @@ void ProgramDesc::Description(std::string header) {
   }
 
   for (const auto &block : this->blocks_) {
-
   }
 #endif
 }

From 0d53906fd7db31a533156786e593c7d0e8c51320 Mon Sep 17 00:00:00 2001
From: wangliu
Date: Thu, 31 May 2018 15:39:04 +0800
Subject: [PATCH 24/26] fix compile error on ubuntu

---
 src/framework/operator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp
index 46feb97cb87..f798d7ade20 100644
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -48,7 +48,7 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
 template <typename Dtype>
 void OperatorBase<Dtype>::Run() const {
   RunImpl();
-#if (PADDLE_MOBILE_DEBUG)
+#ifdef PADDLE_MOBILE_DEBUG
   vector<string> output_keys = GetOutKeys();
   for (const auto key : output_keys) {
     Tensor *out_ = GetVarValue(key, outputs_, *scope_);
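The one-line fix in PATCH 24 is worth a standalone note. "#if (PADDLE_MOBILE_DEBUG)" evaluates the macro's expansion as a constant expression, so if the build defines the macro with no value the directive becomes "#if ()", which the preprocessor rejects — one plausible cause of the Ubuntu compile error named in the subject line. "#ifdef" only asks whether the name is defined. A minimal illustration; the empty in-source define below stands in for whatever the real build system passes:

#include <iostream>

#define PADDLE_MOBILE_DEBUG  // defined, but expands to nothing

int main() {
#ifdef PADDLE_MOBILE_DEBUG  // passes: the name is defined
  std::cout << "debug path taken" << std::endl;
#endif
  // "#if (PADDLE_MOBILE_DEBUG)" would not preprocess here: the empty
  // expansion leaves "#if ()", a syntax error in the condition.
  return 0;
}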
From 2b92e037ff6fdf4cb2f3102c71e03995935fb90d Mon Sep 17 00:00:00 2001
From: liuruilong
Date: Thu, 31 May 2018 16:23:51 +0800
Subject: [PATCH 25/26] fix fc crash

---
 src/common/types.h                             |  3 +-
 .../program/program-optimize/node.cpp          | 41 +++++++++++++++++++
 src/framework/program/program-optimize/node.h  |  5 +++
 .../program-optimize/program_optimize.cpp      | 29 ++++++++++++-
 .../program-optimize/program_optimize.h        |  4 +-
 src/io.cpp                                     |  8 +++-
 src/operators/fusion_conv_add_relu_op.h        |  6 +--
 src/operators/fusion_fc_op.h                   |  6 +--
 test/net/test_googlenet.cpp                    |  5 ++-
 9 files changed, 95 insertions(+), 12 deletions(-)

diff --git a/src/common/types.h b/src/common/types.h
index ca9e64cc60f..5e651a89517 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -77,7 +77,7 @@ static const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
 static const std::string G_OP_TYPE_BOX_CODER = "box_coder";
 static const std::string G_OP_TYPE_CONCAT = "concat";
 static const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
-static const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "FusionConvAddRelu";
+static const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
 static const std::string G_OP_TYPE_FC = "fc";
 static const std::string G_OP_TYPE_LRN = "lrn";
 static const std::string G_OP_TYPE_MUL = "mul";
@@ -92,6 +92,7 @@ static const std::string G_OP_TYPE_TRANSPOSE = "transpose";
 static const std::string G_OP_TYPE_SPLIT = "split";
 static const std::string G_OP_TYPE_FEED = "feed";
 static const std::string G_OP_TYPE_FETCH = "fetch";
+static const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
 
 static std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp
index 31377222db8..3910dc7a0b1 100644
--- a/src/framework/program/program-optimize/node.cpp
+++ b/src/framework/program/program-optimize/node.cpp
@@ -45,6 +45,47 @@ bool Node::operator==(const Node &in) {
   return true;
 }
 
+bool Node::CanSplit(std::unordered_set<std::string> complex_compute_set) {
+  bool split = false;
+  CanSplit(&split, false, 0, &complex_compute_set, this);
+  return split;
+}
+
+void Node::CanSplit(bool *split, bool spliting,
+                    int complex_count,
+                    std::unordered_set<std::string> *complex_compute_set, Node *pre_node) {
+  if (spliting) {
+    if (complex_compute_set->find(this->type_) != complex_compute_set->end()) {
+      complex_count++;
+    }
+  }
+
+  if (inputs_.size() > 1 && pre_node != inputs_.back()) {
+    return;
+  }
+  if (inputs_.size() > 1 && pre_node == inputs_.back()) {
+    if (complex_count > 1) {
+      *split = true;
+      return;
+    }
+  }
+
+  // multi output, to check
+  if (outputs_.size() > 1) {
+    spliting = true;
+    complex_count = 0;
+  } else {
+    if (spliting == true && inputs_.size() > 0) {
+      spliting = false;
+    } else {
+    }
+  }
+
+  for (auto &output : outputs_) {
+    output->CanSplit(split, spliting, complex_count, complex_compute_set, this);
+  }
+}
+
 std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(uint size) {
   std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
   OpDescs(size - 1, &op_descs);
diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h
index da7e26a9ac0..914cb19589d 100644
--- a/src/framework/program/program-optimize/node.h
+++ b/src/framework/program/program-optimize/node.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include
 #include
 #include
+#include <unordered_set>
 
 #include "common/log.h"
 #include "framework/paddle_mobile_object.h"
@@ -36,6 +37,7 @@ class Node : PaddleMobileObject {
       : op_desc_(op_desc), type_(op_desc->Type()) {}
   Node &operator>(std::shared_ptr<Node> node);
   bool operator==(const Node &in);
+  bool CanSplit(std::unordered_set<std::string> complex_compute_set);
   std::string ToString() const;
   std::shared_ptr<Node> To(int size);
   uint Depth(uint begin = 0);
@@ -49,6 +51,9 @@ class Node : PaddleMobileObject {
   void Description();
 
  private:
+  void CanSplit(bool *split, bool spliting,
+                int complex_count,
+                std::unordered_set<std::string> *complex_compute_set, Node *pre_node);
   void OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
                Node *node, bool adding_thread, int thread_num);
   void OpDescs(uint size,
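The CanSplit walk above decides whether a fan-out region contains enough heavy operators to justify splitting; complex_count rides through the recursion by value, so each branch counts independently. Below is a reduced, self-contained sketch of that test — the "more than one busy branch" criterion is a reading of the patch, not something it states — using flat branch lists in place of the Node graph:

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

// Counts ops on one branch whose types are in the "complex" set.
int CountComplex(const std::vector<std::string> &branch,
                 const std::unordered_set<std::string> &complex_ops) {
  int count = 0;
  for (const auto &type : branch) {
    if (complex_ops.count(type) > 0) {
      ++count;
    }
  }
  return count;
}

int main() {
  const std::unordered_set<std::string> complex_ops = {"conv2d",
                                                       "depthwise_conv2d"};
  const std::vector<std::vector<std::string>> branches = {
      {"conv2d", "relu"}, {"conv2d", "batch_norm"}};

  int busy_branches = 0;
  for (const auto &branch : branches) {
    if (CountComplex(branch, complex_ops) > 0) {
      ++busy_branches;
    }
  }
  // Splitting pays off only when more than one parallel branch is busy.
  std::cout << std::boolalpha << "worth splitting: " << (busy_branches > 1)
            << std::endl;
  return 0;
}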
diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp
index 8b0bf295262..11f9b17ad55 100644
--- a/src/framework/program/program-optimize/program_optimize.cpp
+++ b/src/framework/program/program-optimize/program_optimize.cpp
@@ -99,6 +99,7 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
     // DLOG << "node: \n" << *begin_node;
 
     std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
+    // bool can_splite = begin_node->CanSplit({G_OP_TYPE_CONV, G_OP_TYPE_BATCHNORM, G_OP_TYPE_DEPTHWISE_CONV});
     GenerateOps(&op_descs, begin_node.get());
     block->ops_ = op_descs;
   }
@@ -112,6 +112,28 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
   return optimize_program;
 }
 
+
+void ProgramOptimize::GenerateOps(
+    std::vector<std::shared_ptr<framework::OpDesc>> *op_desc, Node *input_node,
+    Node *current_node) {
+
+  if (current_node->inputs_.size() > 1 &&
+      input_node != current_node->inputs_.back()) {
+    return;
+  } else if (current_node->inputs_.size() > 1 &&
+             input_node == current_node->inputs_.back()) {
+    op_desc->push_back(current_node->op_desc_);
+  } else {
+    op_desc->push_back(current_node->op_desc_);
+  }
+
+  for (int i = 0; i < current_node->outputs_.size(); ++i) {
+    auto &output = current_node->outputs_[i];
+    GenerateOps(op_desc, current_node, output.get());
+  }
+
+}
+
 void ProgramOptimize::GenerateOps(
     std::vector<std::shared_ptr<framework::OpDesc>> *op_desc, Node *input_node,
     Node *current_node, bool adding_thread, int thread_num,
@@ -234,7 +257,11 @@ void ProgramOptimize::GenerateOps(
   // std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
   // Node *input_node, Node *current_node, bool adding_thread, int
   // thread_num
-  this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr);
+  if (false) {
+    this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr);
+  } else {
+    this->GenerateOps(op_descs, begin_node, begin_node);
+  }
 }
 
 }  // namespace framework
diff --git a/src/framework/program/program-optimize/program_optimize.h b/src/framework/program/program-optimize/program_optimize.h
index 32d8d1fa914..701358f5905 100644
--- a/src/framework/program/program-optimize/program_optimize.h
+++ b/src/framework/program/program-optimize/program_optimize.h
@@ -33,9 +33,11 @@ class ProgramOptimize {
  private:
   int current_block_;
   std::vector<std::shared_ptr<BlockDesc>> new_blocks_;
-
   void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
                    Node *begin_node);
+  void GenerateOps(
+      std::vector<std::shared_ptr<framework::OpDesc>> *op_desc, Node *input_node,
+      Node *current_node);
   void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
                    Node *input_node, Node *current_node, bool adding_thread,
                    int thread_num, std::shared_ptr<BlockDesc> new_block);
diff --git a/src/io.cpp b/src/io.cpp
index ac89106e498..c99556f0865 100644
--- a/src/io.cpp
+++ b/src/io.cpp
@@ -220,13 +220,18 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
       }
     }
   }
-  originProgramDesc->Description("program: ");
 
   if (optimize) {
     framework::ProgramOptimize program_optimize;
     program.optimizeProgram =
         program_optimize.FushionOptimize(originProgramDesc);
   }
+  if (optimize) {
+    program.optimizeProgram->Description("optimize: ");
+  } else {
+    originProgramDesc->Description("program: ");
+  }
+
 
   paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
   return program;
@@ -254,6 +259,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
     std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
     for (int j = 0; j < ops.size(); ++j) {
       std::shared_ptr<framework::OpDesc> op = ops[j];
+      DLOG << "create op: " << op->Type();
       auto op_base = framework::OpRegistry<Dtype>::CreateOp(
           op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
           program_.scope);
diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h
index 1fa3399cf22..0f52562f0bc 100644
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -28,10 +28,10 @@ class FushionConvAddReluOpMatcher : public framework::FusionOpMatcher {
         std::make_shared<framework::Node>(G_OP_TYPE_RELU);
   }
 
-  void FolderNodes(framework::Node &node) {
+  void FolderNodes(framework::Node *node) {
     std::vector<std::shared_ptr<framework::OpDesc>> origin_descs =
-        node.OpDescs(node_.Depth());
-    node.Folder(node_.Depth(), Type(),
+        node->OpDescs(node_.Depth());
+    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
   }
   std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; }
 };
diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h
index fb49fa61b20..fe628631447 100644
--- a/src/operators/fusion_fc_op.h
+++ b/src/operators/fusion_fc_op.h
@@ -32,10 +32,10 @@ class FusionFcMatcher : public framework::FusionOpMatcher {
     node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
   }
 
-  void FolderNodes(framework::Node &node) {
+  void FolderNodes(framework::Node *node) {
     vector<std::shared_ptr<framework::OpDesc>> origin_descs =
-        node.OpDescs(node_.Depth());
-    node.Folder(node_.Depth(), Type(),
+        node->OpDescs(node_.Depth());
+    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
   }
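One pattern in PATCH 25 deserves a note: FolderNodes now takes framework::Node *node instead of Node &node. Under the Google style this codebase lints with, parameters a function mutates are passed by pointer so the mutation is visible at the call site. A toy illustration of the convention, unrelated to the framework's own types:

#include <cassert>

// The pointer parameter signals at the call site that 'value' is written.
void Increment(int *value) { *value += 1; }

int main() {
  int v = 0;
  Increment(&v);  // the explicit &v marks the write-back
  assert(v == 1);
  return 0;
}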
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index 0640af890cf..302cd3e726e 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -18,11 +18,12 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  bool optimize = true;
   auto time1 = time();
-  auto program = loader.Load(g_googlenet, false);
+  auto program = loader.Load(g_googlenet, optimize);
   auto time2 = time();
   DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
-  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
+  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
   std::vector<float> input;
   std::vector<int64_t> dims{1, 3, 224, 224};
   GetInput<float>(g_test_image_1x3x224x224, &input, dims);

From ddd8e462cbbe6a11ba98de01144c97c137d7f16a Mon Sep 17 00:00:00 2001
From: liuruilong
Date: Thu, 31 May 2018 16:25:58 +0800
Subject: [PATCH 26/26] format files

---
 src/common/types.h                                 |  3 ++-
 src/framework/program/program-optimize/node.cpp    |  6 +++---
 src/framework/program/program-optimize/node.h      |  8 ++++----
 .../program/program-optimize/program_optimize.cpp  | 10 ++++------
 .../program/program-optimize/program_optimize.h    |  5 ++---
 src/io.cpp                                         |  1 -
 src/operators/fusion_conv_add_relu_op.h            |  2 +-
 src/operators/fusion_fc_op.h                       |  4 ++--
 8 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/src/common/types.h b/src/common/types.h
index 5e651a89517..04b78947a6a 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -77,7 +77,8 @@ static const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
 static const std::string G_OP_TYPE_BOX_CODER = "box_coder";
 static const std::string G_OP_TYPE_CONCAT = "concat";
 static const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
-static const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
+static const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU =
+    "fusion_conv_add_relu";
 static const std::string G_OP_TYPE_FC = "fc";
 static const std::string G_OP_TYPE_LRN = "lrn";
 static const std::string G_OP_TYPE_MUL = "mul";
diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp
index 3910dc7a0b1..c165b6568aa 100644
--- a/src/framework/program/program-optimize/node.cpp
+++ b/src/framework/program/program-optimize/node.cpp
@@ -51,9 +51,9 @@ bool Node::CanSplit(std::unordered_set<std::string> complex_compute_set) {
   return split;
 }
 
-void Node::CanSplit(bool *split, bool spliting,
-                    int complex_count,
-                    std::unordered_set<std::string> *complex_compute_set, Node *pre_node) {
+void Node::CanSplit(bool *split, bool spliting, int complex_count,
+                    std::unordered_set<std::string> *complex_compute_set,
+                    Node *pre_node) {
   if (spliting) {
     if (complex_compute_set->find(this->type_) != complex_compute_set->end()) {
       complex_count++;
diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h
index 914cb19589d..8ef26f897d2 100644
--- a/src/framework/program/program-optimize/node.h
+++ b/src/framework/program/program-optimize/node.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #include
 #include
+#include <unordered_set>
 #include
 #include
-#include <unordered_set>
 
 #include "common/log.h"
 #include "framework/paddle_mobile_object.h"
@@ -51,9 +51,9 @@ class Node : PaddleMobileObject {
   void Description();
 
  private:
-  void CanSplit(bool *split, bool spliting,
-                int complex_count,
-                std::unordered_set<std::string> *complex_compute_set, Node *pre_node);
+  void CanSplit(bool *split, bool spliting, int complex_count,
+                std::unordered_set<std::string> *complex_compute_set,
+                Node *pre_node);
   void OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
                Node *node, bool adding_thread, int thread_num);
   void OpDescs(uint size,
diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp
index 11f9b17ad55..d9c3c51c3c8 100644
--- a/src/framework/program/program-optimize/program_optimize.cpp
+++ b/src/framework/program/program-optimize/program_optimize.cpp
@@ -99,7 +99,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
     // DLOG << "node: \n" << *begin_node;
 
     std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
-    // bool can_splite = begin_node->CanSplit({G_OP_TYPE_CONV, G_OP_TYPE_BATCHNORM, G_OP_TYPE_DEPTHWISE_CONV});
+    // bool can_splite = begin_node->CanSplit({G_OP_TYPE_CONV,
+    // G_OP_TYPE_BATCHNORM, G_OP_TYPE_DEPTHWISE_CONV});
     GenerateOps(&op_descs, begin_node.get());
     block->ops_ = op_descs;
   }
@@ -112,11 +113,9 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
   return optimize_program;
 }
 
-
 void ProgramOptimize::GenerateOps(
-    std::vector<std::shared_ptr<framework::OpDesc>> *op_desc, Node *input_node,
-    Node *current_node) {
-
+    std::vector<std::shared_ptr<framework::OpDesc>> *op_desc, Node *input_node,
+    Node *current_node) {
   if (current_node->inputs_.size() > 1 &&
       input_node != current_node->inputs_.back()) {
     return;
@@ -131,7 +130,6 @@ void ProgramOptimize::GenerateOps(
     auto &output = current_node->outputs_[i];
     GenerateOps(op_desc, current_node, output.get());
   }
-
 }
 
 void ProgramOptimize::GenerateOps(
diff --git a/src/framework/program/program-optimize/program_optimize.h b/src/framework/program/program-optimize/program_optimize.h
index 701358f5905..93943cf8395 100644
--- a/src/framework/program/program-optimize/program_optimize.h
+++ b/src/framework/program/program-optimize/program_optimize.h
@@ -35,9 +35,8 @@ class ProgramOptimize {
   std::vector<std::shared_ptr<BlockDesc>> new_blocks_;
   void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
                    Node *begin_node);
-  void GenerateOps(
-      std::vector<std::shared_ptr<framework::OpDesc>> *op_desc, Node *input_node,
-      Node *current_node);
+  void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
+                   Node *input_node, Node *current_node);
   void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
                    Node *input_node, Node *current_node, bool adding_thread,
                    int thread_num, std::shared_ptr<BlockDesc> new_block);
diff --git a/src/io.cpp b/src/io.cpp
index c99556f0865..8f6a07f2dd1 100644
--- a/src/io.cpp
+++ b/src/io.cpp
@@ -232,7 +232,6 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
     originProgramDesc->Description("program: ");
   }
 
-
   paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
   return program;
 }
diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h
index 0f52562f0bc..e93c910d2b3 100644
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -32,7 +32,7 @@ class FushionConvAddReluOpMatcher : public framework::FusionOpMatcher {
     std::vector<std::shared_ptr<framework::OpDesc>> origin_descs =
         node->OpDescs(node_.Depth());
     node->Folder(node_.Depth(), Type(),
-                {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
+                 {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
   }
   std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; }
 };
diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h
index fe628631447..9019ef4d496 100644
--- a/src/operators/fusion_fc_op.h
+++ b/src/operators/fusion_fc_op.h
@@ -32,11 +32,11 @@ class FusionFcMatcher : public framework::FusionOpMatcher {
     node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
   }
 
-  void FolderNodes(framework::Node *node) {
+  void FolderNodes(framework::Node *node) {
     vector<std::shared_ptr<framework::OpDesc>> origin_descs =
         node->OpDescs(node_.Depth());
     node->Folder(node_.Depth(), Type(),
-                {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
+                 {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}});
   }
 
   std::string Type() { return G_OP_TYPE_FC; }
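Taken together, the matcher changes above implement chain fusion: a fixed sequence such as conv2d -> elementwise_add -> relu is recognized in the graph and folded into a single fused op (here fusion_conv_add_relu). The repository's matcher walks a Node DAG; the sketch below is only the one-dimensional analogue over a flat op list, with made-up names:

#include <iostream>
#include <string>
#include <vector>

// True when 'pattern' occurs in 'ops' starting at index 'at'.
bool MatchChain(const std::vector<std::string> &ops, size_t at,
                const std::vector<std::string> &pattern) {
  if (at + pattern.size() > ops.size()) {
    return false;
  }
  for (size_t i = 0; i < pattern.size(); ++i) {
    if (ops[at + i] != pattern[i]) {
      return false;
    }
  }
  return true;
}

int main() {
  const std::vector<std::string> program = {
      "feed", "conv2d", "elementwise_add", "relu", "fetch"};
  const std::vector<std::string> pattern = {"conv2d", "elementwise_add",
                                            "relu"};
  for (size_t i = 0; i < program.size(); ++i) {
    if (MatchChain(program, i, pattern)) {
      std::cout << "fold ops [" << i << ", " << i + pattern.size() - 1
                << "] into fusion_conv_add_relu" << std::endl;
    }
  }
  return 0;
}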