From 0e6ddcc7bc63eb6ddfe5f12f4d9060625befe41a Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 29 Jun 2017 10:01:10 +0800
Subject: [PATCH 01/38] ENH: Add GPU throw error

---
 paddle/platform/error.h | 87 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 paddle/platform/error.h
diff --git a/paddle/platform/error.h b/paddle/platform/error.h
new file mode 100644
index 0000000000000..93424bb610965
--- /dev/null
+++ b/paddle/platform/error.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#ifndef PADDLE_ONLY_CPU
+
+#include <cublas_v2.h>
+#include <cudnn.h>
+#include <curand.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+
+#endif  // PADDLE_ONLY_CPU
+
+namespace paddle {
+namespace platform {
+
+#ifndef PADDLE_ONLY_CPU
+
+inline void throw_on_error(cudaError_t e, const char* message) {
+  if (e) {
+    throw thrust::system_error(e, thrust::cuda_category(), message);
+  }
+}
+
+inline void throw_on_error(curandStatus_t stat, const char* message) {
+  if (stat != CURAND_STATUS_SUCCESS) {
+    throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
+                               message);
+  }
+}
+
+inline void throw_on_error(cudnnStatus_t stat, const char* message) {
+  std::stringstream ss;
+  if (stat == CUDNN_STATUS_SUCCESS) {
+    return;
+  } else {
+    ss << cudnnGetErrorString(stat);
+    ss << ", " << message;
+    throw std::runtime_error(ss.str());
+  }
+}
+
+inline void throw_on_error(cublasStatus_t stat, const char* message) {
+  std::stringstream ss;
+  if (stat == CUBLAS_STATUS_SUCCESS) {
+    return;
+  } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
+    ss << "CUBLAS: not initialized";
+  } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
+    ss << "CUBLAS: alloc failed";
+  } else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
+    ss << "CUBLAS: invalid value";
+  } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
+    ss << "CUBLAS: arch mismatch";
+  } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
+    ss << "CUBLAS: mapping error";
+  } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
+    ss << "CUBLAS: execution failed";
+  } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
+    ss << "CUBLAS: internal error";
+  } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
+    ss << "CUBLAS: not supported";
+  } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
+    ss << "CUBLAS: license error";
+  }
+  ss << ", " << message;
+  throw std::runtime_error(ss.str());
+}
+
+inline void throw_on_error(cublasStatus_t stat) {
+  const char* message = "";
+  throw_on_error(stat, message);
+}
+
+#endif  // PADDLE_ONLY_CPU
+
+inline void throw_on_error(int stat, const char* message) {
+  if (stat) {
+    throw std::runtime_error(message + (", stat = " + std::to_string(stat)));
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle

From d3b77a5bc053b77309ecc094450e755604217674 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 29 Jun 2017 13:56:38 +0800
Subject: [PATCH 02/38] ENH: Add Gpu info

---
 paddle/platform/gpu_info.cc | 49 +++++++++++++++++++++++++++++++++++++
 paddle/platform/gpu_info.h  | 36 +++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 paddle/platform/gpu_info.cc
 create mode 100644 paddle/platform/gpu_info.h

diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
new file mode 100644
index 0000000000000..4208d83078c90
--- /dev/null
+++ b/paddle/platform/gpu_info.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/gpu_info.h"
+#include "gflags/gflags.h"
+#include "paddle/platform/error.h"
+
+DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
+              "Default use 95% of GPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+int GpuDeviceCount() {
+  int count;
+  throw_on_error(
+      cudaGetDeviceCount(&count),
+      "cudaGetDeviceCount failed in paddle::platform::GpuDeviceCount");
+  return count;
+}
+
+void GpuMemoryUsage(size_t& available, size_t& total) {
+  throw_on_error(cudaMemGetInfo(&available, &total),
+                 "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
+}
+
+size_t GpuMaxAllocSize() {
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  return total * FLAGS_fraction_of_gpu_memory_to_use;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
new file mode 100644
index 0000000000000..174f093b435c9
--- /dev/null
+++ b/paddle/platform/gpu_info.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef PADDLE_ONLY_CPU
+
+#include <stddef.h>
+
+namespace paddle {
+namespace platform {
+
+//! Get the total number of GPU devices in system.
+int GpuDeviceCount();
+
+//！Get the memory usage of current GPU device.
+void GpuMemoryUsage(size_t& available, size_t& total);
+
+//! Get the maximum allocation size of current GPU device.
+size_t GpuMaxAllocSize();
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif  // PADDLE_ONLY_CPU

From b29923f902dc6da1416a94bc153448f1546e62b2 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 29 Jun 2017 13:56:57 +0800
Subject: [PATCH 03/38] ENH: Add CPU info

---
 paddle/platform/cpu_info.cc | 55 +++++++++++++++++++++++++++++++++++++
 paddle/platform/cpu_info.h  | 26 ++++++++++++++++++
 2 files changed, 81 insertions(+)
 create mode 100644 paddle/platform/cpu_info.cc
 create mode 100644 paddle/platform/cpu_info.h

diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc
new file mode 100644
index 0000000000000..deff76502e8cb
--- /dev/null
+++ b/paddle/platform/cpu_info.cc
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/cpu_info.h"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "gflags/gflags.h"
+#include "paddle/platform/error.h"
+
+DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+              "Default use 100% of CPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+inline size_t CpuTotalPhysicalMemory() {
+#ifdef __APPLE__
+  int mib[2];
+  mib[0] = CTL_HW;
+  mib[1] = HW_MEMSIZE;
+  int64_t size = 0;
+  size_t len = sizeof(size);
+  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
+  return 0L;
+#else
+  long pages = sysconf(_SC_PHYS_PAGES);
+  long page_size = sysconf(_SC_PAGE_SIZE);
+  return pages * page_size;
+#endif
+}
+
+size_t CpuTotalMemory() {
+  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h
new file mode 100644
index 0000000000000..3b768589e150f
--- /dev/null
+++ b/paddle/platform/cpu_info.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>
+
+namespace paddle {
+namespace platform {
+
+//! Get the total memory on host machine.
+size_t CpuTotalMemory();
+
+}  // namespace platform
+}  // namespace paddle

From 169022d0148a77cd10f16a82e841a75750e7e173 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 29 Jun 2017 14:04:47 +0800
Subject: [PATCH 04/38] FIX: Improve fallback gpu allocator

---
 paddle/memory/detail/CMakeLists.txt           |  4 +-
 paddle/memory/detail/system_allocator.cc      | 64 ++++++++++++++-----
 paddle/memory/detail/system_allocator.h       | 15 +++--
 paddle/memory/detail/system_allocator_test.cc | 14 ++--
 paddle/platform/CMakeLists.txt                |  4 ++
 paddle/platform/cpu_info_test.cc              | 18 ++++++
 paddle/platform/cuda.h                        | 40 ------------
 7 files changed, 85 insertions(+), 74 deletions(-)
 create mode 100644 paddle/platform/cpu_info_test.cc
 delete mode 100644 paddle/platform/cuda.h

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 72d3749ad789e..6caa97a76bbfd 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,6 +1,8 @@
 if(${WITH_GPU})
   nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
+  nv_test(system_allocator_test
+    SRCS system_allocator_test.cc
+    DEPS system_allocator gpu_info gflags)
 else(${WITH_GPU})
   cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
   cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 50bec926f83de..332ff062d4784 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -13,32 +13,39 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/error.h"
+#include "paddle/platform/gpu_info.h"
 
 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
 
 #include "gflags/gflags.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cuda.h"
 
 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the amount
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, false,
-            "If set, allocate cpu/gpu pinned memory.");
+DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory.");
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-void* CPUAllocator::Alloc(size_t size) {
+void* CPUAllocator::Alloc(size_t& index, size_t size) {
   // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
   // malloc might not return nullptr if size is zero, but the returned
   // pointer shall not be dereferenced -- so we make it nullptr.
   if (size <= 0) return nullptr;
 
+  if (FLAGS_use_pinned_memory) {
+    void* p = malloc(size);
+    if (p != nullptr) {
+      mlock(p, size);
+    }
+  }
+
   void* p = malloc(size);
   if (p != nullptr && FLAGS_use_pinned_memory) {
     mlock(p, size);
@@ -46,7 +53,7 @@ void* CPUAllocator::Alloc(size_t size) {
   return p;
 }
 
-void CPUAllocator::Free(void* p, size_t size) {
+void CPUAllocator::Free(void* p, size_t size, size_t index) {
   if (p != nullptr && FLAGS_use_pinned_memory) {
     munlock(p, size);
   }
@@ -55,29 +62,52 @@ void CPUAllocator::Free(void* p, size_t size) {
 
 #ifndef PADDLE_ONLY_CPU
 
-void* GPUAllocator::Alloc(size_t size) {
+void* GPUAllocator::Alloc(size_t& index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0.  We just make sure it does.
-  if (size <= 0) {
-    return nullptr;
-  }
+  if (size <= 0) return nullptr;
 
+  size_t available = 0;
+  size_t capacity = 0;
+  paddle::platform::GpuMemoryUsage(available, capacity);
+
+  // Reserve memory for page tables, etc.
+  size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
+  size_t remaining = available > reserving ? available - reserving : 0;
+
+  // If remaining size no less than expected size, using general
+  // cudaMalloc to allocate GPU memory.
   void* p = 0;
-  cudaError_t result =
-      FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
-  if (result != cudaSuccess) {
-    cudaGetLastError();  // clear error if there is any.
+  if (size <= remaining) {
+    cudaError_t result = cudaMalloc(&p, size);
+    if (result == cudaSuccess) {
+      index = 0;
+      total_alloc_size_ += size;
+      return p;
+    }
   }
-  return result == cudaSuccess ? p : nullptr;
+
+  // If remaining size less than expected size or cudaMalloc failed,
+  // cudaMallocHost will be considered as a fallback allocator.
+  cudaError_t result = cudaMallocHost(&p, size);
+  if (result == cudaSuccess) {
+    index = 1;
+    total_alloc_size_ += size;
+    return p;
+  }
+
+  return nullptr;
 }
 
-void GPUAllocator::Free(void* p, size_t size) {
+void GPUAllocator::Free(void* p, size_t size, size_t index) {
   // Purposefully allow cudaErrorCudartUnloading, because
   // that is returned if you ever call cudaFree after the
   // driver has already shutdown. This happens only if the
   // process is terminating, in which case we don't care if
   // cudaFree succeeds.
-  cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
+  PADDLE_ASSERT(total_alloc_size_ >= size);
+  total_alloc_size_ -= size;
+  cudaError_t err = index == 1 ? cudaFreeHost(p) : cudaFree(p);
   if (err != cudaErrorCudartUnloading) {
     platform::throw_on_error(err, "cudaFree{Host} failed");
   }
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 184b383f7f782..e15302ce4f0ae 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -30,21 +30,24 @@ namespace detail {
 class SystemAllocator {
  public:
   virtual ~SystemAllocator() {}
-  virtual void* Alloc(size_t size) = 0;
-  virtual void Free(void* p, size_t size) = 0;
+  virtual void* Alloc(size_t& index, size_t size) = 0;
+  virtual void Free(void* p, size_t size, size_t index) = 0;
 };
 
 class CPUAllocator : public SystemAllocator {
  public:
-  virtual void* Alloc(size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
 };
 
 #ifndef PADDLE_ONLY_CPU
 class GPUAllocator : public SystemAllocator {
  public:
-  virtual void* Alloc(size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+
+ private:
+  size_t total_alloc_size_ = 0;
 };
 #endif  // PADDLE_ONLY_CPU
 
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index 9bd5706a4e4d1..ba44e06ddb68e 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory);
 void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
   bool freed = false;
   {
-    void* p = a.Alloc(size);
+    size_t index;
+    void* p = a.Alloc(index, size);
     if (size > 0) {
       EXPECT_NE(p, nullptr);
     } else {
@@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
     int* i = static_cast<int*>(p);
     std::shared_ptr<int> ptr(i, [&](void* p) {
       freed = true;
-      a.Free(p, size);
+      a.Free(p, size, index);
     });
   }
   EXPECT_TRUE(freed);
@@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) {
 }
 
 #ifndef PADDLE_ONLY_CPU
-TEST(GPUAllocator, NoStaging) {
-  FLAGS_use_pinned_memory = false;
-  paddle::memory::detail::GPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
-}
-TEST(GPUAllocator, Staging) {
-  FLAGS_use_pinned_memory = true;
+TEST(GPUAllocator, Alloc) {
   paddle::memory::detail::GPUAllocator a;
   TestAllocator(a, 2048);
   TestAllocator(a, 0);
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 7abe2ab89e079..17342356d6018 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,3 +1,7 @@
+cc_library(cpu_info SRCS cpu_info.cc)
+cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags)
+
+nv_library(gpu_info SRCS gpu_info.cc)
 nv_test(cuda_test SRCS cuda_test.cu)
 
 cc_library(place SRCS place.cc)
diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc
new file mode 100644
index 0000000000000..5b7ce7c763e39
--- /dev/null
+++ b/paddle/platform/cpu_info_test.cc
@@ -0,0 +1,18 @@
+#include "paddle/platform/cpu_info.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+DECLARE_double(fraction_of_cpu_memory_to_use);
+
+TEST(CpuMemoryUsage, Print) {
+  std::stringstream ss;
+  size_t mem_size = paddle::platform::CpuTotalMemory() / 1024 / 1024 / 1024;
+  ss << std::to_string(
+            static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use * 100))
+     << "% of CPU Memory Usage: " << mem_size << " GB";
+  std::cout << ss.str();
+}
diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h
deleted file mode 100644
index 8fe891f9ce6c3..0000000000000
--- a/paddle/platform/cuda.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_ONLY_CPU
-
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-
-namespace paddle {
-namespace platform {
-
-inline void throw_on_error(cudaError_t e, const char* message) {
-  if (e) {
-    throw thrust::system_error(e, thrust::cuda_category(), message);
-  }
-}
-
-int GetDeviceCount(void) {
-  int count;
-  throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed");
-  return count;
-}
-
-}  // namespace platform
-}  // namespace paddle
-
-#endif  // PADDLE_ONLY_CPU

From e6c14f7e000d047cf3d3a1e18e2a13e3349b1ff9 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 29 Jun 2017 16:30:03 +0800
Subject: [PATCH 05/38] ENH: Polish cpu info interface

---
 paddle/platform/CMakeLists.txt   |  3 +-
 paddle/platform/cpu_info.cc      | 14 +++++++-
 paddle/platform/cpu_info.h       | 10 ++++--
 paddle/platform/cpu_info_test.cc | 13 ++++---
 paddle/platform/cuda_test.cu     | 59 --------------------------------
 5 files changed, 30 insertions(+), 69 deletions(-)
 delete mode 100644 paddle/platform/cuda_test.cu

diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index d0bedf6ba921a..969c91985d069 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,8 +1,7 @@
 cc_library(cpu_info SRCS cpu_info.cc)
-cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags)
+cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags glog)
 
 nv_library(gpu_info SRCS gpu_info.cc)
-nv_test(cuda_test SRCS cuda_test.cu)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc
index deff76502e8cb..3da04420e57c3 100644
--- a/paddle/platform/cpu_info.cc
+++ b/paddle/platform/cpu_info.cc
@@ -47,9 +47,21 @@ inline size_t CpuTotalPhysicalMemory() {
 #endif
 }
 
-size_t CpuTotalMemory() {
+size_t CpuMaxAllocSize() {
+  // For distributed systems, it requires configuring and limiting
+  // the fraction of memory to use.
   return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }
 
+size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
+size_t CpuMaxChunkSize() {
+  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
+  return CpuMaxAllocSize() / 32;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h
index 3b768589e150f..8df7c7b4bca5b 100644
--- a/paddle/platform/cpu_info.h
+++ b/paddle/platform/cpu_info.h
@@ -19,8 +19,14 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-//! Get the total memory on host machine.
-size_t CpuTotalMemory();
+//! Get the maximum allocation size for a machine.
+size_t CpuMaxAllocSize();
+
+//! Get the minimum chunk size for buddy allocator.
+size_t CpuMinChunkSize();
+
+//! Get the maximum chunk size for buddy allocator.
+size_t CpuMaxChunkSize();
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc
index 5b7ce7c763e39..8fb195aa7c0a4 100644
--- a/paddle/platform/cpu_info_test.cc
+++ b/paddle/platform/cpu_info_test.cc
@@ -1,18 +1,21 @@
 #include "paddle/platform/cpu_info.h"
+#include "paddle/string/printf.h"
 
 #include <ostream>
 #include <sstream>
 
 #include "gflags/gflags.h"
+#include "glog/logging.h"
 #include "gtest/gtest.h"
 
 DECLARE_double(fraction_of_cpu_memory_to_use);
 
 TEST(CpuMemoryUsage, Print) {
   std::stringstream ss;
-  size_t mem_size = paddle::platform::CpuTotalMemory() / 1024 / 1024 / 1024;
-  ss << std::to_string(
-            static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use * 100))
-     << "% of CPU Memory Usage: " << mem_size << " GB";
-  std::cout << ss.str();
+  size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024;
+  float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100;
+
+  std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n",
+                                       use_percent, memory_size)
+            << std::endl;
 }
diff --git a/paddle/platform/cuda_test.cu b/paddle/platform/cuda_test.cu
deleted file mode 100644
index 4067dda2f19f7..0000000000000
--- a/paddle/platform/cuda_test.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <cuda_runtime.h>
-#include <stdio.h>
-#include "gtest/gtest.h"
-
-#define CHECK_ERR(x)                 \
-  if (x != cudaSuccess) {            \
-    fprintf(stderr,                  \
-            "%s in %s at line %d\n", \
-            cudaGetErrorString(err), \
-            __FILE__,                \
-            __LINE__);               \
-    exit(-1);                        \
-  }
-
-__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < n) {
-    d_C[i] = d_A[i] + d_B[i];
-  }
-}
-
-TEST(Cuda, Equality) {
-  int n = 10;
-  // Memory allocation for h_A, h_B and h_C (in the host)
-  float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0};
-  float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
-  float h_C[10];
-  float *d_A, *d_B, *d_C;
-  cudaError_t err;
-  // Memory allocation for d_A, d_B and d_C (in the device)
-  err = cudaMalloc((void **)&d_A, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  err = cudaMalloc((void **)&d_B, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  err = cudaMalloc((void **)&d_C, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  // Copying memory to device
-  err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice);
-  CHECK_ERR(err);
-
-  err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice);
-  CHECK_ERR(err);
-
-  // Calling the kernel
-  vecAdd<<<ceil(n / 256.0), 256>>>(d_A, d_B, d_C, n);
-
-  // Copying results back to host
-  err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost);
-  CHECK_ERR(err);
-
-  EXPECT_EQ(h_C[0], 1.0);
-  for (int i = 1; i < n - 1; ++i) {
-    EXPECT_EQ(h_C[i], 11.0);
-  }
-  EXPECT_EQ(h_C[9], 1.0);
-}

From 6e7209f0584f73eb22313d98c676333379736d1e Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 29 Jun 2017 16:30:32 +0800
Subject: [PATCH 06/38] ENH: Add gpu info interface

---
 paddle/platform/gpu_info.cc | 25 +++++++++++++++++++++++++
 paddle/platform/gpu_info.h  |  6 ++++++
 2 files changed, 31 insertions(+)

diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index 4208d83078c90..d6c6fe34ef41e 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -42,8 +42,33 @@ size_t GpuMaxAllocSize() {
 
   GpuMemoryUsage(available, total);
 
+  // Reserve the rest for page tables, etc.
   return total * FLAGS_fraction_of_gpu_memory_to_use;
 }
 
+size_t GpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
+size_t GpuMaxChunkSize() {
+  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  // Reserving the rest memory for page tables, etc.
+  size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;
+
+  // If available less than minimum chunk size, no usable memory exists.
+  available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize();
+
+  // If available less than reserving, no usable memory exists.
+  size_t usable = std::max(available, reserving) - reserving;
+
+  return usable;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index 174f093b435c9..d7bf0f4093ef3 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -30,6 +30,12 @@ void GpuMemoryUsage(size_t& available, size_t& total);
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
 
+//! Get the minimum chunk size for GPU buddy allocator.
+size_t GpuMinChunkSize();
+
+//! Get the maximum chunk size for GPU buddy allocator.
+size_t GpuMaxChunkSize();
+
 }  // namespace platform
 }  // namespace paddle
 

From 464886bf56cd91ffcd6617390d62dbd13c90a093 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 29 Jun 2017 16:31:05 +0800
Subject: [PATCH 07/38] FIX: fix typo in piece.h

---
 paddle/string/piece.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/string/piece.h b/paddle/string/piece.h
index db7c3e69804a6..0272529d1c9b2 100644
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
@@ -35,7 +35,7 @@ class Piece {
 
   // We provide non-explicit singleton constructors so users can
   // pass in a "const char*" or a "string" wherever a "Piece"
-  // is expected.  These contructors ensure that if data_ is NULL,
+  // is expected.  These constructors ensure that if data_ is NULL,
   // size_ is 0.
   Piece();
   Piece(const char* d, size_t n);

From 26cd0bb5a59d913f8c216ceee0c6abb46317e31e Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 29 Jun 2017 19:13:24 +0800
Subject: [PATCH 08/38] ENH: count allocated fallback size for performance

---
 paddle/memory/detail/system_allocator.cc      | 52 +++++++++++++------
 paddle/memory/detail/system_allocator.h       |  3 +-
 .../paddle/trainer_config_helpers/networks.py |  4 +-
 3 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 332ff062d4784..2b0fbfa87e22a 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -39,22 +39,22 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {
   // pointer shall not be dereferenced -- so we make it nullptr.
   if (size <= 0) return nullptr;
 
-  if (FLAGS_use_pinned_memory) {
-    void* p = malloc(size);
-    if (p != nullptr) {
-      mlock(p, size);
-    }
-  }
+  index = 0;  // unlock memory
 
   void* p = malloc(size);
-  if (p != nullptr && FLAGS_use_pinned_memory) {
-    mlock(p, size);
+
+  if (p != nullptr) {
+    if (FLAGS_use_pinned_memory) {
+      index = 1;
+      mlock(p, size);  // lock memory
+    }
   }
+
   return p;
 }
 
 void CPUAllocator::Free(void* p, size_t size, size_t index) {
-  if (p != nullptr && FLAGS_use_pinned_memory) {
+  if (p != nullptr && index == 1) {
     munlock(p, size);
   }
   free(p);
@@ -73,26 +73,34 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
 
   // Reserve memory for page tables, etc.
   size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
-  size_t remaining = available > reserving ? available - reserving : 0;
+  size_t usable = available > reserving ? available - reserving : 0;
 
   // If remaining size no less than expected size, using general
   // cudaMalloc to allocate GPU memory.
   void* p = 0;
-  if (size <= remaining) {
+  if (size <= usable) {
     cudaError_t result = cudaMalloc(&p, size);
     if (result == cudaSuccess) {
       index = 0;
-      total_alloc_size_ += size;
+      gpu_alloc_size_ += size;
       return p;
     }
   }
 
   // If remaining size less than expected size or cudaMalloc failed,
   // cudaMallocHost will be considered as a fallback allocator.
+  //
+  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+  // of host fallback allocation. Allocates too much would reduce
+  // the amount of memory available to the underlying system for paging.
+  usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+
+  if (size > usable) return nullptr;
+
   cudaError_t result = cudaMallocHost(&p, size);
   if (result == cudaSuccess) {
     index = 1;
-    total_alloc_size_ += size;
+    fallback_alloc_size_ += size;
     return p;
   }
 
@@ -100,16 +108,26 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
 }
 
 void GPUAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+
+  if (index == 0) {
+    PADDLE_ASSERT(gpu_alloc_size_ >= size);
+    gpu_alloc_size_ -= size;
+    err = cudaFree(p);
+  } else {
+    PADDLE_ASSERT(fallback_alloc_size_ >= size);
+    fallback_alloc_size_ -= size;
+    err = cudaFreeHost(p);
+  }
+
   // Purposefully allow cudaErrorCudartUnloading, because
   // that is returned if you ever call cudaFree after the
   // driver has already shutdown. This happens only if the
   // process is terminating, in which case we don't care if
   // cudaFree succeeds.
-  PADDLE_ASSERT(total_alloc_size_ >= size);
-  total_alloc_size_ -= size;
-  cudaError_t err = index == 1 ? cudaFreeHost(p) : cudaFree(p);
   if (err != cudaErrorCudartUnloading) {
-    platform::throw_on_error(err, "cudaFree{Host} failed");
+    platform::throw_on_error(err,
+                             "cudaFree{Host} failed in GPUAllocator::Free.");
   }
 }
 
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index e15302ce4f0ae..7093c42967162 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -47,7 +47,8 @@ class GPUAllocator : public SystemAllocator {
   virtual void Free(void* p, size_t size, size_t index);
 
  private:
-  size_t total_alloc_size_ = 0;
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
 };
 #endif  // PADDLE_ONLY_CPU
 
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 67154a8d7d366..1bf59ed4840ae 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1381,7 +1381,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)
 
-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])
 
 
 def outputs(layers, *args):
@@ -1424,7 +1424,7 @@ def __dfs_travel__(layer,
     assert len(layers) > 0
 
     if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
         return  # just return outputs.
 
     if len(layers) != 1:

From fb51c3dc895b78df966dd0d9713657289b1986b3 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 29 Jun 2017 19:57:10 +0800
Subject: [PATCH 09/38] FIX: add compile dependency gflags

---
 paddle/platform/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 969c91985d069..5cbe491b2b57f 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,7 +1,7 @@
-cc_library(cpu_info SRCS cpu_info.cc)
+cc_library(cpu_info SRCS cpu_info.cc DEPS gflags)
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags glog)
 
-nv_library(gpu_info SRCS gpu_info.cc)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

From 275e5b7d42903ea3c9bf4e4fed3f9eab45c727bf Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Mon, 3 Jul 2017 11:12:18 +0800
Subject: [PATCH 10/38] FIX: yapf format version

---
 python/paddle/trainer_config_helpers/networks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index f0b6625dc3736..b77932ce5f094 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1395,7 +1395,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)
 
-    Inputs(*[l.name for l in layers])
+    Inputs(* [l.name for l in layers])
 
 
 def outputs(layers, *args):
@@ -1438,7 +1438,7 @@ def __dfs_travel__(layer,
     assert len(layers) > 0
 
     if HasInputsSet():  # input already set
-        Outputs(*[l.name for l in layers])
+        Outputs(* [l.name for l in layers])
         return  # just return outputs.
 
     if len(layers) != 1:

From 89110fd2660098bc949a1f13f7b53515e0c931a3 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Mon, 3 Jul 2017 19:51:32 +0800
Subject: [PATCH 11/38] ENH: Add useGpu in system allocator

---
 paddle/memory/detail/system_allocator.cc | 4 ++++
 paddle/memory/detail/system_allocator.h  | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 2b0fbfa87e22a..75a2c91ef938e 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -60,6 +60,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
   free(p);
 }
 
+bool CPUAllocator::UseGpu() { return false; }
+
 #ifndef PADDLE_ONLY_CPU
 
 void* GPUAllocator::Alloc(size_t& index, size_t size) {
@@ -131,6 +133,8 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
   }
 }
 
+bool GPUAllocator::UseGpu() { return true; }
+
 #endif  // PADDLE_ONLY_CPU
 
 }  // namespace detail
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 7093c42967162..f3bbfef843542 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -32,12 +32,14 @@ class SystemAllocator {
   virtual ~SystemAllocator() {}
   virtual void* Alloc(size_t& index, size_t size) = 0;
   virtual void Free(void* p, size_t size, size_t index) = 0;
+  virtual bool UseGpu() = 0;
 };
 
 class CPUAllocator : public SystemAllocator {
  public:
   virtual void* Alloc(size_t& index, size_t size);
   virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu();
 };
 
 #ifndef PADDLE_ONLY_CPU
@@ -45,7 +47,7 @@ class GPUAllocator : public SystemAllocator {
  public:
   virtual void* Alloc(size_t& index, size_t size);
   virtual void Free(void* p, size_t size, size_t index);
-
+  virtual bool UseGpu();
  private:
   size_t gpu_alloc_size_ = 0;
   size_t fallback_alloc_size_ = 0;

From 929f9cbdff08090a222495db7db601f164cebb8c Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Mon, 3 Jul 2017 19:52:04 +0800
Subject: [PATCH 12/38] ENH: Add Metadata for memory block

---
 paddle/memory/detail/metadata.cc | 62 ++++++++++++++++++++++++++++++++
 paddle/memory/detail/metadata.h  | 53 +++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 paddle/memory/detail/metadata.cc
 create mode 100644 paddle/memory/detail/metadata.h

diff --git a/paddle/memory/detail/metadata.cc b/paddle/memory/detail/metadata.cc
new file mode 100644
index 0000000000000..4607cd8512ef5
--- /dev/null
+++ b/paddle/memory/detail/metadata.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/memory/detail/metadata.h"
+
+#include <functional>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
+                   MemoryBlock* l, MemoryBlock* r)
+    : type(t),
+      index(i),
+      size(s),
+      total_size(ts),
+      left_buddy(l),
+      right_buddy(r) {}
+
+template <class T>
+inline void hash_combine(std::size_t& seed, const T& v) {
+  std::hash<T> hasher;
+  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+inline size_t hash(const Metadata* metadata, size_t initial_seed) {
+  size_t seed = initial_seed;
+
+  hash_combine(seed, (size_t)metadata->type);
+  hash_combine(seed, metadata->index);
+  hash_combine(seed, metadata->size);
+  hash_combine(seed, metadata->total_size);
+  hash_combine(seed, metadata->left_buddy);
+  hash_combine(seed, metadata->right_buddy);
+
+  return seed;
+}
+
+void Metadata::update_guards() {
+  guard_begin = hash(this, 1);
+  guard_end = hash(this, 2);
+}
+
+bool Metadata::check_guards() const {
+  return guard_begin == hash(this, 1) && guard_end == hash(this, 2);
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/metadata.h b/paddle/memory/detail/metadata.h
new file mode 100644
index 0000000000000..ddb826571b6d6
--- /dev/null
+++ b/paddle/memory/detail/metadata.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/memory_block.h"
+
+#include <stddef.h>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+class Metadata {
+ public:
+  Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
+           MemoryBlock* r);
+
+ public:
+  /*! \brief Update the guards when metadata is changed */
+  void update_guards();
+
+  /*! \brief Check consistency to previous modification */
+  bool check_guards() const;
+
+ public:
+  // TODO(gangliao): compress this
+  // clang-format off
+  size_t            guard_begin = 0;
+  MemoryBlock::Type type        = MemoryBlock::INVALID_CHUNK;
+  size_t            index       = 0;
+  size_t            size        = 0;
+  size_t            total_size  = 0;
+  MemoryBlock*      left_buddy  = nullptr;
+  MemoryBlock*      right_buddy = nullptr;
+  size_t            guard_end   = 0;
+  // clang-format on
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle

From bbd3eab7ee88f02131edb41738a966aa0f1a0e88 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Mon, 3 Jul 2017 19:54:32 +0800
Subject: [PATCH 13/38] ENH: Add Alloc for buddy Allocator

* Free will be added soon
---
 paddle/memory/detail/buddy_allocator.cc | 157 ++++++++++++++++++++++--
 paddle/memory/detail/buddy_allocator.h  |  88 +++++++++----
 2 files changed, 209 insertions(+), 36 deletions(-)

diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index ebe680f5eea49..2462ba084b996 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -12,22 +12,161 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#pragma once
-
 #include "paddle/memory/detail/buddy_allocator.h"
+#include "glog/logging.h"
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
-                               SystemAllocator* system_allocator)
-    : pool_size_(pool_size),
-      max_pools_(max_pools),
-      system_allocator_(system_allocator) {
-  PADDLE_ASSERT(pool_size > 0);
-  PADDLE_ASSERT(max_pools > 0);
+BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
+                               size_t min_chunk_size, size_t max_chunk_size) {
+  PADDLE_ASSERT(min_chunk_size > 0);
+  PADDLE_ASSERT(max_chunk_size > 0);
   PADDLE_ASSERT(system_allocator != nullptr);
+
+  system_allocator_ = std::move(system_allocator);
+  min_chunk_size_ = min_chunk_size;
+  max_chunk_size_ = max_chunk_size;
+}
+
+inline size_t align(size_t size, size_t alignment) {
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+void* BuddyAllocator::Alloc(size_t unaligned_size) {
+  // adjust allocation alignment
+  size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);
+
+  // acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size "
+             << size;
+
+  // if the allocation is huge, send directly to the system allocator
+  if (size > max_chunk_size_) {
+    DLOG(INFO) << "Allocate from system allocator.";
+
+    return SystemAlloc(size);
+  }
+
+  // query and allocate from the existing chunk
+  auto it = FindExistChunk(size);
+
+  // refill the pool if failure
+  if (it == pool_.end()) {
+    it = RefillPool();
+  } else {
+    DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it)
+               << " at address "
+               << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
+  }
+
+  // if still failure, fail fatally
+  if (it == pool_.end()) {
+    return nullptr;
+  }
+
+  total_used_ += size;
+  total_free_ -= size;
+
+  // split the allocation and return data for use
+  return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
+}
+
+void* BuddyAllocator::SystemAlloc(size_t size) {
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, size);
+
+  DLOG(INFO) << "Allocated " << p << " from system allocator.";
+
+  if (p == nullptr) return nullptr;
+
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index,
+                                     size, nullptr, nullptr);
+
+  return static_cast<MemoryBlock*>(p)->data();
+}
+
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+#ifndef PADDLE_ONLY_CPU
+  if (system_allocator_->UseGpu()) {
+    if ((total_used_ + total_free_) == 0) {
+      // Compute the maximum allocation size for the first allocation.
+      max_chunk_size_ = platform::GpuMaxChunkSize();
+    }
+  }
+#endif  // PADDLE_ONLY_CPU
+
+  // Allocate a new maximum sized block
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, max_chunk_size_);
+
+  if (p == nullptr) return pool_.end();
+
+  DLOG(INFO) << " Creating and inserting new block " << p
+             << " from system allocator";
+
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
+                                     max_chunk_size_, nullptr, nullptr);
+
+  total_free_ += max_chunk_size_;
+
+  // dump the block into pool
+  return pool_.insert({index, max_chunk_size_, p}).first;
+}
+
+BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
+  size_t index = 0;
+
+  while (1) {
+    auto it = pool_.lower_bound({index, size, nullptr});
+    if (it == pool_.end()) return it;
+
+    if (std::get<0>(*it) > index) {
+      if (std::get<1>(*it) >= size) {
+        return it;
+      }
+
+      index = std::get<0>(*it);
+      continue;
+    }
+    return it;
+  }
+}
+
+void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
+                                   size_t size) {
+  auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
+
+  pool_.erase(it);
+
+  DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_)
+             << ") into";
+
+  block->split(cache_, size);
+
+  DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_)
+             << ")";
+
+  block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
+
+  // the rest of memory if exist
+  if (block->has_right_buddy(cache_)) {
+    if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      DLOG(INFO) << " Insert right block (" << block->right_buddy(cache_)
+                 << ", " << block->right_buddy(cache_)->total_size(cache_)
+                 << ")";
+
+      pool_.insert({block->right_buddy(cache_)->index(cache_),
+                    block->right_buddy(cache_)->total_size(cache_),
+                    block->right_buddy(cache_)});
+    }
+  }
+
+  return block;
 }
 
 }  // namespace detail
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 82e6aaedc7199..38bedc9a18366 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -15,9 +15,15 @@
 #pragma once
 
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/memory/detail/metadata.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
 
+#include <set>
 #include <mutex>
 #include <vector>
+#include <unordered_map>
 
 namespace paddle {
 namespace memory {
@@ -25,55 +31,83 @@ namespace detail {
 
 class BuddyAllocator {
  public:
-  BuddyAllocator(size_t pool_size, size_t max_pools,
-                 SystemAllocator* system_allocator);
+  BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size,
+                 size_t max_chunk_size);
+
   ~BuddyAllocator();
 
-  void* Alloc(size_t size);
+ public:
+  void* Alloc(size_t unaligned_size);
   void Free(void*);
   size_t Used();
 
+ public:
+  // Disable copy and assignment.
+  BuddyAllocator(const BuddyAllocator&) = delete;
+  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+
  private:
-  struct Block {
-    size_t size_;
-    Block* left_;   // left buddy
-    Block* right_;  // right buddy
-  };
+  // Tuple type: allocator index, memory size, memory address
+  using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
+  using PoolSet = std::set<IndexSizeAddress>;
 
-  // Initially, there is only one pool.  If a Alloc founds not enough
-  // memory from that pool, and there has not been max_num_pools_,
-  // create a new pool by calling system_allocator_.Alloc(pool_size_).
-  std::vector<void*> pools_;
+  /*! \brief Allocate fixed-size memory from system */
+  void* SystemAlloc(size_t size);
 
-  size_t pool_size_;      // the size of each pool;
-  size_t max_num_pools_;  // the size of all pools;
+  /*! \brief If existing chunks are not suitable, refill pool */
+  PoolSet::iterator RefillPool();
 
-  SystemAllocator* system_allocator_;
+  /** 
+   *  \brief Find the suitable chunk from existing pool
+   *  
+   *  \param it   pool iterator which contains suitable block.
+   *  \param size the size of allocation.
+   */
+  void* SplitToAlloc(PoolSet::iterator it, size_t size);
 
-  std::mutex mutex_;
+  /*! \brief Find the existing chunk which used to allocation  */
+  PoolSet::iterator FindExistChunk(size_t size);
 
-  // Disable copy and assignment.
-  BuddyAllocator(const BuddyAllocator&) = delete;
-  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+ private:
+  size_t total_used_ = 0;  // the total size of used memory
+  size_t total_free_ = 0;  // the total size of free memory
+
+  size_t min_chunk_size_;  // the minimum size of each chunk
+  size_t max_chunk_size_;  // the maximum size of each chunk
+
+ private:
+  PoolSet pool_;
+
+ private:
+  // Unify the metadata format between GPU and CPU allocations
+  using MetadataCache = std::unordered_map<const MemoryBlock*, Metadata>;
+  MetadataCache cache_;
+
+ private:
+  SystemAllocator* system_allocator_;
+  std::mutex mutex_;
 };
 
-BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
-  static BuddyAllocator<CPUAllocator>* a = nullptr;
+BuddyAllocator* GetCPUBuddyAllocator() {
+  static BuddyAllocator* a = nullptr;
   if (a == nullptr) {
-    a = new BuddyAllocator<CPUAllocator>();
+    a = new BuddyAllocator(new CPUAllocator, platform::CpuMinChunkSize(),
+                           platform::CpuMaxChunkSize());
   }
   return a;
 }
 
 #ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
 
-BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
-  static BuddyAllocator<GPUAllocator>** as = NULL;
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
   if (as == NULL) {
-    int gpu_num = platform::GetDeviceCount();
-    as = new BuddyAllocator<GPUAllocator>*[gpu_num];
+    int gpu_num = platform::GpuDeviceCount();
+    as = new BuddyAllocator*[gpu_num];
     for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] = new BuddyAllocator<GPUAllocator>();
+      as[gpu] =
+          new BuddyAllocator(new GPUAllocator, platform::GpuMinChunkSize(),
+                             platform::GpuMaxChunkSize());
     }
   }
   return as[gpu_id];

From 4e1617d05994bda1a9eb0e0b5b563249cc12f271 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 4 Jul 2017 13:15:00 +0800
Subject: [PATCH 14/38] ENH: add buddy alloctor Free

---
 paddle/memory/detail/CMakeLists.txt     |   3 +
 paddle/memory/detail/buddy_allocator.cc |  19 ++--
 paddle/memory/detail/buddy_allocator.h  |  10 +-
 paddle/memory/detail/memory_block.cc    | 145 ++++++++++++++++++++++++
 paddle/memory/detail/memory_block.h     |  97 ++++++++++++++++
 paddle/memory/detail/system_allocator.h |   1 +
 6 files changed, 262 insertions(+), 13 deletions(-)
 create mode 100644 paddle/memory/detail/memory_block.cc
 create mode 100644 paddle/memory/detail/memory_block.h

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 6caa97a76bbfd..dbc98a8a62fb3 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -7,3 +7,6 @@ else(${WITH_GPU})
   cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
   cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
 endif(${WITH_GPU})
+
+cc_library(metadata SRCS metadata.cc)
+cc_library(buddy_allocator SRCS buddy_allocator.cc)
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index 2462ba084b996..e8d694327d8ce 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -58,17 +58,16 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // refill the pool if failure
   if (it == pool_.end()) {
     it = RefillPool();
+    // if still failure, fail fatally
+    if (it == pool_.end()) {
+      return nullptr;
+    }
   } else {
     DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it)
                << " at address "
                << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
   }
 
-  // if still failure, fail fatally
-  if (it == pool_.end()) {
-    return nullptr;
-  }
-
   total_used_ += size;
   total_free_ -= size;
 
@@ -76,6 +75,13 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
 }
 
+void BuddyAllocator::Free(void* p) {
+  auto block = static_cast<MemoryBlock*>(p)->metadata();
+
+  // acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+}
+
 void* BuddyAllocator::SystemAlloc(size_t size) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(index, size);
@@ -140,17 +146,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
 void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
                                    size_t size) {
   auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
-
   pool_.erase(it);
 
   DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_)
              << ") into";
-
   block->split(cache_, size);
 
   DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_)
              << ")";
-
   block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
 
   // the rest of memory if exist
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 38bedc9a18366..4006bdcce8d1c 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -14,16 +14,16 @@
 
 #pragma once
 
-#include "paddle/memory/detail/system_allocator.h"
 #include "paddle/memory/detail/metadata.h"
+#include "paddle/memory/detail/system_allocator.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cpu_info.h"
 #include "paddle/platform/gpu_info.h"
 
-#include <set>
 #include <mutex>
-#include <vector>
+#include <set>
 #include <unordered_map>
+#include <vector>
 
 namespace paddle {
 namespace memory {
@@ -57,9 +57,9 @@ class BuddyAllocator {
   /*! \brief If existing chunks are not suitable, refill pool */
   PoolSet::iterator RefillPool();
 
-  /** 
+  /**
    *  \brief Find the suitable chunk from existing pool
-   *  
+   *
    *  \param it   pool iterator which contains suitable block.
    *  \param size the size of allocation.
    */
diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc
new file mode 100644
index 0000000000000..1c9e87df49703
--- /dev/null
+++ b/paddle/memory/detail/memory_block.cc
@@ -0,0 +1,145 @@
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
+                       void* left_buddy, void* right_buddy) {
+  cache.store(this,
+              MemoryBlockMetadata(t, index, size - overhead(), size,
+                                  static_cast<MemoryBlock*>(left_buddy),
+                                  static_cast<MemoryBlock*>(right_buddy)));
+}
+
+MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
+  return cache.load(this).type;
+}
+
+size_t MemoryBlock::size(MetadataCache& cache) const {
+  return cache.load(this).size;
+}
+
+size_t MemoryBlock::total_size(MetadataCache& cache) const {
+  return cache.load(this).total_size;
+}
+
+MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const {
+  return cache.load(this).left_buddy;
+}
+
+MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
+  return cache.load(this).right_buddy;
+}
+
+void MemoryBlock::split(MetadataCache& cache, size_t size) {
+  // make sure the split fits
+  assert(total_size(cache) >= size);
+
+  // bail out if there is no room for another partition
+  if (total_size(cache) - size <= overhead()) {
+    return;
+  }
+
+  // find the position of the split
+  void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
+
+  size_t remaining_size = total_size(cache) - size;
+
+  // Add the new block as a buddy
+  auto metadata = cache.load(this);
+
+  // Write the metadata for the new block
+  auto new_block_right_buddy = metadata.right_buddy;
+
+  cache.store(static_cast<MemoryBlock*>(right_partition),
+              MemoryBlockMetadata(FREE_MEMORY, index(cache),
+                                  remaining_size - overhead(), remaining_size,
+                                  this, new_block_right_buddy));
+
+  metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
+  metadata.size = size - overhead();
+  metadata.total_size = size;
+
+  cache.store(this, metadata);
+
+  // Write metadata for the new block's right buddy
+  if (new_block_right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(new_block_right_buddy);
+
+    buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
+
+    cache.store(new_block_right_buddy, buddy_metadata);
+  }
+}
+
+void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
+  // only free blocks can be merged
+  assert(type(cache) == FREE_MEMORY);
+  assert(right_buddy->type(cache) == FREE_MEMORY);
+
+  auto metadata = cache.load(this);
+
+  // link this->buddy's buddy
+  metadata.right_buddy = right_buddy->right_buddy(cache);
+
+  // link buddy's buddy -> this
+  if (metadata.right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(metadata.right_buddy);
+
+    buddy_metadata.left_buddy = this;
+
+    cache.store(metadata.right_buddy, buddy_metadata);
+  }
+
+  metadata.size += right_buddy->total_size(cache);
+  metadata.total_size += right_buddy->total_size(cache);
+
+  cache.store(this, metadata);
+  cache.store(right_buddy,
+              MemoryBlockMetadata(INVALID_MEMORY, 0, 0, 0, nullptr, nullptr));
+}
+
+void MemoryBlock::mark_as_free(MetadataCache& cache) {
+  // check for double free or corruption
+  assert(type(cache) != FREE_MEMORY);
+  assert(type(cache) != INVALID_MEMORY);
+
+  set_type(cache, FREE_MEMORY);
+}
+
+void MemoryBlock::set_type(MetadataCache& cache, Type t) {
+  auto metadata = cache.load(this);
+
+  metadata.type = t;
+
+  cache.store(this, metadata);
+}
+
+bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
+  return left_buddy(cache) != nullptr;
+}
+
+bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
+  return right_buddy(cache) != nullptr;
+}
+
+size_t MemoryBlock::index(MetadataCache& cache) const {
+  return cache.load(this).index;
+}
+
+void* MemoryBlock::data() const {
+  return const_cast<MemoryBlockMetadata*>(
+             reinterpret_cast<const MemoryBlockMetadata*>(this)) +
+         1;
+}
+
+MemoryBlock* MemoryBlock::metadata() const {
+  return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
+      reinterpret_cast<const MemoryBlockMetadata*>(this) - 1));
+}
+
+}  // detail
+}  // memory
+}  // paddle
diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h
new file mode 100644
index 0000000000000..e2d39c31cfbca
--- /dev/null
+++ b/paddle/memory/detail/memory_block.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/metadata.h"
+
+#include <cstddef>
+#include <unordered_map>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+// Forward Declaration
+class Metadata;
+
+/*! \brief A class used to interpret the contents of a memory block */
+class MemoryBlock {
+ public:
+  // Unify the metadata format between GPU and CPU allocations
+  using MetadataCache = std::unordered_map<const MemoryBlock*, Metadata>;
+
+  enum Type {
+    FREE_CHUNK,    // memory is free and idle
+    ARENA_CHUNK,   // memory is being occupied
+    HUGE_CHUNK,    // memory is out of management
+    INVALID_CHUNK  // memory is invalid
+  };
+
+ public:
+  void init(MetadataCache& cache, Type t, size_t index, size_t size,
+            void* left_buddy, void* right_buddy);
+
+ public:
+  /*! \brief The type of the allocation */
+  Type type(MetadataCache& cache) const;
+
+  /*! \brief The size of the data region */
+  size_t size(MetadataCache& cache) const;
+
+  /*! \brief An index to track the allocator */
+  size_t index(MetadataCache& cache) const;
+
+  /*! \brief The total size of the block */
+  size_t total_size(MetadataCache& cache) const;
+
+  /*! \brief Check the left buddy of the block */
+  bool has_left_buddy(MetadataCache& cache) const;
+
+  /*! \brief Check the right buddy of the block */
+  bool has_right_buddy(MetadataCache& cache) const;
+
+  /*! \brief Get the left buddy */
+  MemoryBlock* left_buddy(MetadataCache& cache) const;
+
+  /*! \brief Get the right buddy */
+  MemoryBlock* right_buddy(MetadataCache& cache) const;
+
+ public:
+  /*! \brief Split the allocation into left/right blocks */
+  void split(MetadataCache& cache, size_t size);
+
+  /*! \brief Merge left and right blocks together */
+  void merge(MetadataCache& cache, MemoryBlock* right_buddy);
+
+  /*! \brief Mark the allocation as free */
+  void mark_as_free(MetadataCache& cache);
+
+  /*! \brief Change the type of the allocation */
+  void set_type(MetadataCache& cache, Type t);
+
+ public:
+  /*! \brief Get a pointer to the memory block's data */
+  void* data() const;
+
+  /*! \brief Get a pointer to the memory block's metadata */
+  MemoryBlock* metadata() const;
+
+ public:
+  static size_t overhead();
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index f3bbfef843542..555061a533ffe 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -48,6 +48,7 @@ class GPUAllocator : public SystemAllocator {
   virtual void* Alloc(size_t& index, size_t size);
   virtual void Free(void* p, size_t size, size_t index);
   virtual bool UseGpu();
+
  private:
   size_t gpu_alloc_size_ = 0;
   size_t fallback_alloc_size_ = 0;

From ff36389452c1af6cc6a5f03b5ca52404ab20f108 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 4 Jul 2017 15:21:24 +0800
Subject: [PATCH 15/38] ENH: code style

---
 paddle/memory/detail/buddy_allocator.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index e8d694327d8ce..eddfd9d13c71b 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -48,7 +48,6 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // if the allocation is huge, send directly to the system allocator
   if (size > max_chunk_size_) {
     DLOG(INFO) << "Allocate from system allocator.";
-
     return SystemAlloc(size);
   }
 

From 379434b243faeaf9fd4d38cf9f95dfe45cc563d5 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 4 Jul 2017 17:21:09 +0800
Subject: [PATCH 16/38] Delete cmake in dynload

---
 paddle/platform/dynload/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 paddle/platform/dynload/CMakeLists.txt

diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
deleted file mode 100644
index 9f829b7012865..0000000000000
--- a/paddle/platform/dynload/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)

From 0ba63475659822bd146f1f1dcfc7eabca8b7047d Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 4 Jul 2017 21:23:23 +0800
Subject: [PATCH 17/38] ENH: Add buddy allocator Free

---
 paddle/memory/detail/CMakeLists.txt           |  9 ++--
 paddle/memory/detail/buddy_allocator.cc       | 41 ++++++++++++++++++-
 paddle/memory/detail/buddy_allocator.h        |  2 +-
 paddle/memory/detail/memory_block.h           |  2 +-
 .../detail/{metadata.cc => meta_data.cc}      |  2 +-
 .../memory/detail/{metadata.h => meta_data.h} |  0
 paddle/platform/cpu_info.h                    | 10 -----
 paddle/platform/gpu_info.cc                   | 13 ++++++
 paddle/platform/gpu_info.h                    |  6 +++
 9 files changed, 65 insertions(+), 20 deletions(-)
 rename paddle/memory/detail/{metadata.cc => meta_data.cc} (97%)
 rename paddle/memory/detail/{metadata.h => meta_data.h} (100%)

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index dbc98a8a62fb3..c3167cd30aa31 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,12 +1,9 @@
 if(${WITH_GPU})
-  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  nv_test(system_allocator_test
-    SRCS system_allocator_test.cc
-    DEPS system_allocator gpu_info gflags)
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags gpu_info)
 else(${WITH_GPU})
   cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
 endif(${WITH_GPU})
+cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
 
-cc_library(metadata SRCS metadata.cc)
+cc_library(meta_data SRCS meta_data.cc)
 cc_library(buddy_allocator SRCS buddy_allocator.cc)
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index eddfd9d13c71b..f677feda0d559 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -75,10 +75,49 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
 }
 
 void BuddyAllocator::Free(void* p) {
+  // Point back to metadata
   auto block = static_cast<MemoryBlock*>(p)->metadata();
 
-  // acquire the allocator lock
+  // Acquire the allocator lock
   std::lock_guard<std::mutex> lock(mutex_);
+
+  DLOG(INFO) << "Free from address " << block;
+
+  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
+    DLOG(INFO) << "Free directly from system allocator";
+    system_allocator_->Free(block, block->total_size(cache_),
+                            block->index(cache_));
+
+    // Invalidate GPU allocation from cache
+    if (system_allocator_->UseGpu()) {
+      cache_.erase(block);
+    }
+    return;
+  }
+
+  block->mark_as_free(cache_);
+
+  total_used_ -= block->total_size(cache_);
+  total_free_ += block->total_size(cache_);
+
+  // Trying to merge the right buddy
+  if (block->has_right_buddy(cache_)) {
+    DLOG(INFO) << "Merging this block " << block << " with its right buddy "
+               << block->right_buddy(cache_);
+  }
+
+  // Trying to merge the left buddy
+  if (block->has_left_buddy(cache_)) {
+    DLOG(INFO) << "Merging this block " << block << " with its left buddy "
+               << block->left_buddy(cache_);
+  }
+
+  // Dumping this block into pool
+  DLOG(INFO) << "Inserting free block (" << block << ", "
+             << block->total_size(cache_) << ")";
+  pool_.insert({block->index(cache_), block->total_size(cache_), block});
+
+  // TODO(gangliao): Clean up if existing too much free memory
 }
 
 void* BuddyAllocator::SystemAlloc(size_t size) {
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 4006bdcce8d1c..49bd6cf9019e7 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/memory/detail/metadata.h"
+#include "paddle/memory/detail/meta_data.h"
 #include "paddle/memory/detail/system_allocator.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cpu_info.h"
diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h
index e2d39c31cfbca..2945520113ae4 100644
--- a/paddle/memory/detail/memory_block.h
+++ b/paddle/memory/detail/memory_block.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/memory/detail/metadata.h"
+#include "paddle/memory/detail/meta_data.h"
 
 #include <cstddef>
 #include <unordered_map>
diff --git a/paddle/memory/detail/metadata.cc b/paddle/memory/detail/meta_data.cc
similarity index 97%
rename from paddle/memory/detail/metadata.cc
rename to paddle/memory/detail/meta_data.cc
index 4607cd8512ef5..a3b7a9b4fed94 100644
--- a/paddle/memory/detail/metadata.cc
+++ b/paddle/memory/detail/meta_data.cc
@@ -12,7 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/memory/detail/metadata.h"
+#include "paddle/memory/detail/meta_data.h"
 
 #include <functional>
 
diff --git a/paddle/memory/detail/metadata.h b/paddle/memory/detail/meta_data.h
similarity index 100%
rename from paddle/memory/detail/metadata.h
rename to paddle/memory/detail/meta_data.h
diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h
index edd76517a6481..8df7c7b4bca5b 100644
--- a/paddle/platform/cpu_info.h
+++ b/paddle/platform/cpu_info.h
@@ -28,15 +28,5 @@ size_t CpuMinChunkSize();
 //! Get the maximum chunk size for buddy allocator.
 size_t CpuMaxChunkSize();
 
-int GetCurrentDeviceId(void) {
-  int device_id;
-  throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed");
-  return device_id;
-}
-
-void SetDeviceId(int device_id) {
-  throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed");
-}
-
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index d6c6fe34ef41e..05a243c50696a 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -31,6 +31,19 @@ int GpuDeviceCount() {
   return count;
 }
 
+int GetCurrentDeviceId() {
+  int device_id;
+  throw_on_error(
+      cudaGetDevice(&device_id),
+      "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId");
+  return device_id;
+}
+
+void SetDeviceId(int id) {
+  throw_on_error(cudaSetDevice(id),
+                 "cudaSetDevice failed in paddle::platform::SetDeviceId");
+}
+
 void GpuMemoryUsage(size_t& available, size_t& total) {
   throw_on_error(cudaMemGetInfo(&available, &total),
                  "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index d7bf0f4093ef3..81ee5f6e0a95e 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -24,6 +24,12 @@ namespace platform {
 //! Get the total number of GPU devices in system.
 int GpuDeviceCount();
 
+//! Get the current GPU device id in system.
+int GetCurrentDeviceId();
+
+//! Set the GPU device id for next execution.
+void SetDeviceId(int device_id);
+
 //！Get the memory usage of current GPU device.
 void GpuMemoryUsage(size_t& available, size_t& total);
 

From 4dc3c9e0cc1b6ec5dbc324f4804974247ca6506f Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 4 Jul 2017 23:28:15 +0800
Subject: [PATCH 18/38] ENH: Add paddle_memory for external usage

---
 paddle/memory/CMakeLists.txt            | 10 ++++
 paddle/memory/detail/CMakeLists.txt     |  6 +++
 paddle/memory/detail/buddy_allocator.cc | 12 ++---
 paddle/memory/detail/buddy_allocator.h  |  2 +-
 paddle/memory/detail/memory_block.cc    | 56 +++++++++++--------
 paddle/memory/detail/memory_block.h     | 10 +---
 paddle/memory/detail/meta_cache.cc      | 57 ++++++++++++++++++++
 paddle/memory/detail/meta_cache.h       | 71 +++++++++++++++++++++++++
 paddle/memory/detail/meta_data.cc       |  8 +++
 paddle/memory/detail/meta_data.h        |  1 +
 10 files changed, 196 insertions(+), 37 deletions(-)
 create mode 100644 paddle/memory/detail/meta_cache.cc
 create mode 100644 paddle/memory/detail/meta_cache.h

diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 3943c3cfad31d..8c290712fc9fb 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1 +1,11 @@
 add_subdirectory(detail)
+
+cc_library(memory
+    SRCS
+    memory.cc)
+
+cc_library(paddle_memory
+    DEPS
+    memory meta_data
+    meta_cache memory_block
+    buddy_allocator system_allocator)
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index c3167cd30aa31..4fdabc8eebd4f 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -3,7 +3,13 @@ if(${WITH_GPU})
 else(${WITH_GPU})
   cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
 endif(${WITH_GPU})
+
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
 
 cc_library(meta_data SRCS meta_data.cc)
+
+cc_library(meta_cache SRCS meta_cache.cc)
+
+cc_library(memory_block SRCS memory_block.cc)
+
 cc_library(buddy_allocator SRCS buddy_allocator.cc)
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index f677feda0d559..aa5b6b557c5dd 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -20,14 +20,14 @@ namespace memory {
 namespace detail {
 
 BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
-                               size_t min_chunk_size, size_t max_chunk_size) {
+                               size_t min_chunk_size, size_t max_chunk_size)
+    : min_chunk_size_(min_chunk_size),
+      max_chunk_size_(max_chunk_size),
+      cache_(system_allocator->UseGpu()),
+      system_allocator_(std::move(system_allocator)) {
   PADDLE_ASSERT(min_chunk_size > 0);
   PADDLE_ASSERT(max_chunk_size > 0);
   PADDLE_ASSERT(system_allocator != nullptr);
-
-  system_allocator_ = std::move(system_allocator);
-  min_chunk_size_ = min_chunk_size;
-  max_chunk_size_ = max_chunk_size;
 }
 
 inline size_t align(size_t size, size_t alignment) {
@@ -90,7 +90,7 @@ void BuddyAllocator::Free(void* p) {
 
     // Invalidate GPU allocation from cache
     if (system_allocator_->UseGpu()) {
-      cache_.erase(block);
+      cache_.invalidate(block);
     }
     return;
   }
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 49bd6cf9019e7..ecf23b77ae8ff 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "paddle/memory/detail/meta_cache.h"
 #include "paddle/memory/detail/meta_data.h"
 #include "paddle/memory/detail/system_allocator.h"
 #include "paddle/platform/assert.h"
@@ -80,7 +81,6 @@ class BuddyAllocator {
 
  private:
   // Unify the metadata format between GPU and CPU allocations
-  using MetadataCache = std::unordered_map<const MemoryBlock*, Metadata>;
   MetadataCache cache_;
 
  private:
diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc
index 1c9e87df49703..eaa97e7b4ad3d 100644
--- a/paddle/memory/detail/memory_block.cc
+++ b/paddle/memory/detail/memory_block.cc
@@ -1,4 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/meta_data.h"
 #include "paddle/platform/assert.h"
 
 namespace paddle {
@@ -7,10 +23,9 @@ namespace detail {
 
 void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
                        void* left_buddy, void* right_buddy) {
-  cache.store(this,
-              MemoryBlockMetadata(t, index, size - overhead(), size,
-                                  static_cast<MemoryBlock*>(left_buddy),
-                                  static_cast<MemoryBlock*>(right_buddy)));
+  cache.store(this, Metadata(t, index, size - sizeof(Metadata), size,
+                             static_cast<MemoryBlock*>(left_buddy),
+                             static_cast<MemoryBlock*>(right_buddy)));
 }
 
 MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
@@ -35,10 +50,10 @@ MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
 
 void MemoryBlock::split(MetadataCache& cache, size_t size) {
   // make sure the split fits
-  assert(total_size(cache) >= size);
+  PADDLE_ASSERT(total_size(cache) >= size);
 
   // bail out if there is no room for another partition
-  if (total_size(cache) - size <= overhead()) {
+  if (total_size(cache) - size <= sizeof(Metadata)) {
     return;
   }
 
@@ -53,13 +68,13 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) {
   // Write the metadata for the new block
   auto new_block_right_buddy = metadata.right_buddy;
 
-  cache.store(static_cast<MemoryBlock*>(right_partition),
-              MemoryBlockMetadata(FREE_MEMORY, index(cache),
-                                  remaining_size - overhead(), remaining_size,
-                                  this, new_block_right_buddy));
+  cache.store(
+      static_cast<MemoryBlock*>(right_partition),
+      Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata),
+               remaining_size, this, new_block_right_buddy));
 
   metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
-  metadata.size = size - overhead();
+  metadata.size = size - sizeof(Metadata);
   metadata.total_size = size;
 
   cache.store(this, metadata);
@@ -76,8 +91,8 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) {
 
 void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
   // only free blocks can be merged
-  assert(type(cache) == FREE_MEMORY);
-  assert(right_buddy->type(cache) == FREE_MEMORY);
+  PADDLE_ASSERT(type(cache) == FREE_MEMORY);
+  PADDLE_ASSERT(right_buddy->type(cache) == FREE_MEMORY);
 
   auto metadata = cache.load(this);
 
@@ -97,16 +112,15 @@ void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
   metadata.total_size += right_buddy->total_size(cache);
 
   cache.store(this, metadata);
-  cache.store(right_buddy,
-              MemoryBlockMetadata(INVALID_MEMORY, 0, 0, 0, nullptr, nullptr));
+  cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
 }
 
 void MemoryBlock::mark_as_free(MetadataCache& cache) {
   // check for double free or corruption
-  assert(type(cache) != FREE_MEMORY);
-  assert(type(cache) != INVALID_MEMORY);
+  PADDLE_ASSERT(type(cache) != FREE_CHUNK);
+  PADDLE_ASSERT(type(cache) != INVALID_CHUNK);
 
-  set_type(cache, FREE_MEMORY);
+  set_type(cache, FREE_CHUNK);
 }
 
 void MemoryBlock::set_type(MetadataCache& cache, Type t) {
@@ -130,14 +144,12 @@ size_t MemoryBlock::index(MetadataCache& cache) const {
 }
 
 void* MemoryBlock::data() const {
-  return const_cast<MemoryBlockMetadata*>(
-             reinterpret_cast<const MemoryBlockMetadata*>(this)) +
-         1;
+  return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1;
 }
 
 MemoryBlock* MemoryBlock::metadata() const {
   return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
-      reinterpret_cast<const MemoryBlockMetadata*>(this) - 1));
+      reinterpret_cast<const Metadata*>(this) - 1));
 }
 
 }  // detail
diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h
index 2945520113ae4..a5168b519f3a3 100644
--- a/paddle/memory/detail/memory_block.h
+++ b/paddle/memory/detail/memory_block.h
@@ -14,24 +14,18 @@
 
 #pragma once
 
-#include "paddle/memory/detail/meta_data.h"
-
 #include <cstddef>
-#include <unordered_map>
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-// Forward Declaration
-class Metadata;
+// Forward Declarations
+class MetadataCache;
 
 /*! \brief A class used to interpret the contents of a memory block */
 class MemoryBlock {
  public:
-  // Unify the metadata format between GPU and CPU allocations
-  using MetadataCache = std::unordered_map<const MemoryBlock*, Metadata>;
-
   enum Type {
     FREE_CHUNK,    // memory is free and idle
     ARENA_CHUNK,   // memory is being occupied
diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc
new file mode 100644
index 0000000000000..189ab4fc7bb74
--- /dev/null
+++ b/paddle/memory/detail/meta_cache.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
+
+Metadata MetadataCache::load(const MemoryBlock* block) {
+  if (uses_gpu_) {
+    auto existing_metadata = cache_.find(block);
+    assert(existing_metadata->second.check_guards());
+    return existing_metadata->second;
+  } else {
+    PADDLE_ASSERT(reinterpret_cast<const Metadata*>(block)->check_guards());
+    return *reinterpret_cast<const Metadata*>(block);
+  }
+}
+
+void MetadataCache::store(MemoryBlock* block,
+                          const Metadata& original_metadata) {
+  auto metadata = original_metadata;
+
+  metadata.update_guards();
+
+  if (uses_gpu_) {
+    cache_[block] = metadata;
+  } else {
+    *reinterpret_cast<Metadata*>(block) = metadata;
+  }
+}
+
+void MetadataCache::invalidate(MemoryBlock* block) {
+  if (uses_gpu_) {
+    cache_.erase(block);
+  }
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h
new file mode 100644
index 0000000000000..3ca1020d22ead
--- /dev/null
+++ b/paddle/memory/detail/meta_cache.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_data.h"
+
+#include <unordered_map>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+/*! A cache for accessing memory block meta-data that may be expensive to access
+   directly.
+
+    Note: this class exists to unify the metadata format between GPU and CPU
+   allocations.
+    It should be removed when the CPU can access all GPU allocations directly
+   via UVM.
+*/
+class MetadataCache {
+ public:
+  MetadataCache(bool uses_gpu);
+
+ public:
+  /*! \brief Load the associated metadata for the specified memory block. */
+  Metadata load(const MemoryBlock*);
+
+  /*! \brief Store the associated metadata for the specified memory block. */
+  void store(MemoryBlock*, const Metadata&);
+
+ public:
+  /*! \brief Acquire any external metadata updates. */
+  void acquire(MemoryBlock*);
+
+  /*! \brief Publish any local updates externally. */
+  void release(MemoryBlock*);
+
+  /*! \brief Indicate that the specified metadata will no longer be used */
+  void invalidate(MemoryBlock*);
+
+ public:
+  MetadataCache(const MetadataCache&) = delete;
+  MetadataCache& operator=(const MetadataCache&) = delete;
+
+ private:
+  bool uses_gpu_;
+
+ private:
+  typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
+
+ private:
+  MetadataMap cache_;
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc
index a3b7a9b4fed94..70c5c1f439e84 100644
--- a/paddle/memory/detail/meta_data.cc
+++ b/paddle/memory/detail/meta_data.cc
@@ -29,6 +29,14 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
       left_buddy(l),
       right_buddy(r) {}
 
+Metadata::Metadata()
+    : type(MemoryBlock::INVALID_CHUNK),
+      index(0),
+      size(0),
+      total_size(0),
+      left_buddy(nullptr),
+      right_buddy(nullptr) {}
+
 template <class T>
 inline void hash_combine(std::size_t& seed, const T& v) {
   std::hash<T> hasher;
diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h
index ddb826571b6d6..628cf1f2e347e 100644
--- a/paddle/memory/detail/meta_data.h
+++ b/paddle/memory/detail/meta_data.h
@@ -26,6 +26,7 @@ class Metadata {
  public:
   Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
            MemoryBlock* r);
+  Metadata();
 
  public:
   /*! \brief Update the guards when metadata is changed */

From d0ad0314bb868b9e0c1aa77f74ca0d2d3e8b8ef0 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 5 Jul 2017 16:33:18 +0800
Subject: [PATCH 19/38] FIX: glog dependency

---
 paddle/memory/detail/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 4fdabc8eebd4f..6cb6422e473da 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -12,4 +12,4 @@ cc_library(meta_cache SRCS meta_cache.cc)
 
 cc_library(memory_block SRCS memory_block.cc)
 
-cc_library(buddy_allocator SRCS buddy_allocator.cc)
+cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)

From ada1c20bbc2520d566b7d2bd2a56cf94cbcddd27 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 5 Jul 2017 19:16:02 +0800
Subject: [PATCH 20/38] FIX: Buddy Allocator Free with Merge feature

---
 paddle/memory/detail/buddy_allocator.cc | 33 ++++++++++++++++++++++---
 paddle/memory/detail/buddy_allocator.h  | 15 +++++++----
 paddle/memory/detail/memory_block.cc    |  4 +--
 paddle/platform/CMakeLists.txt          |  2 +-
 4 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index aa5b6b557c5dd..9f334a7048fc6 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -89,9 +89,8 @@ void BuddyAllocator::Free(void* p) {
                             block->index(cache_));
 
     // Invalidate GPU allocation from cache
-    if (system_allocator_->UseGpu()) {
-      cache_.invalidate(block);
-    }
+    cache_.invalidate(block);
+
     return;
   }
 
@@ -104,12 +103,35 @@ void BuddyAllocator::Free(void* p) {
   if (block->has_right_buddy(cache_)) {
     DLOG(INFO) << "Merging this block " << block << " with its right buddy "
                << block->right_buddy(cache_);
+
+    auto right_buddy = block->right_buddy(cache_);
+
+    if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      // Take away right buddy from pool
+      pool_.erase({right_buddy->index(cache_), right_buddy->total_size(cache_),
+                   right_buddy});
+
+      // merge its right buddy to the block
+      block->merge(cache_, right_buddy);
+    }
   }
 
   // Trying to merge the left buddy
   if (block->has_left_buddy(cache_)) {
     DLOG(INFO) << "Merging this block " << block << " with its left buddy "
                << block->left_buddy(cache_);
+
+    auto left_buddy = block->left_buddy(cache_);
+
+    if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      // Take away right buddy from pool
+      pool_.erase({left_buddy->index(cache_), left_buddy->total_size(cache_),
+                   left_buddy});
+
+      // merge the block to its left buddy
+      left_buddy->merge(cache_, block);
+      block = left_buddy;
+    }
   }
 
   // Dumping this block into pool
@@ -167,13 +189,16 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
 
   while (1) {
     auto it = pool_.lower_bound({index, size, nullptr});
+
+    // no match chunk memory
     if (it == pool_.end()) return it;
 
     if (std::get<0>(*it) > index) {
+      // find suitable one
       if (std::get<1>(*it) >= size) {
         return it;
       }
-
+      // update and continue
       index = std::get<0>(*it);
       continue;
     }
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index ecf23b77ae8ff..2fd9c8162a98a 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -42,14 +42,14 @@ class BuddyAllocator {
   void Free(void*);
   size_t Used();
 
- public:
+ private:
   // Disable copy and assignment.
   BuddyAllocator(const BuddyAllocator&) = delete;
   BuddyAllocator& operator=(const BuddyAllocator&) = delete;
 
- private:
-  // Tuple type: allocator index, memory size, memory address
+  // Tuple (allocator index, memory size, memory address)
   using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
+  // Each element in PoolSet is a free allocation
   using PoolSet = std::set<IndexSizeAddress>;
 
   /*! \brief Allocate fixed-size memory from system */
@@ -57,7 +57,6 @@ class BuddyAllocator {
 
   /*! \brief If existing chunks are not suitable, refill pool */
   PoolSet::iterator RefillPool();
-
   /**
    *  \brief Find the suitable chunk from existing pool
    *
@@ -77,13 +76,19 @@ class BuddyAllocator {
   size_t max_chunk_size_;  // the maximum size of each chunk
 
  private:
+  /**
+   * \brief A list of free allocation
+   *
+   * \note  Only store free chunk memory in pool
+   */
   PoolSet pool_;
 
  private:
-  // Unify the metadata format between GPU and CPU allocations
+  /*! Unify the metadata format between GPU and CPU allocations */
   MetadataCache cache_;
 
  private:
+  /*! Allocate CPU/GPU memory from system */
   SystemAllocator* system_allocator_;
   std::mutex mutex_;
 };
diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc
index eaa97e7b4ad3d..bc67bcef0fdf9 100644
--- a/paddle/memory/detail/memory_block.cc
+++ b/paddle/memory/detail/memory_block.cc
@@ -91,8 +91,8 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) {
 
 void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
   // only free blocks can be merged
-  PADDLE_ASSERT(type(cache) == FREE_MEMORY);
-  PADDLE_ASSERT(right_buddy->type(cache) == FREE_MEMORY);
+  PADDLE_ASSERT(type(cache) == FREE_CHUNK);
+  PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK);
 
   auto metadata = cache.load(this);
 
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 2f3d1c061e37c..0ad11f1b106be 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -6,4 +6,4 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags)
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
-cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc)
+cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags)

From 74691789e9e5ee782adb003642f66699603b20e2 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 6 Jul 2017 01:16:03 +0800
Subject: [PATCH 21/38] ENH: add memory unit test

---
 paddle/memory/CMakeLists.txt            |  6 ++--
 paddle/memory/detail/CMakeLists.txt     |  4 +--
 paddle/memory/detail/buddy_allocator.cc | 33 +++++++++++------
 paddle/memory/detail/buddy_allocator.h  | 27 --------------
 paddle/memory/memory.cc                 | 42 ++++++++++++++++++----
 paddle/memory/memory_test.cc            | 48 +++++++++++++++++++++++++
 paddle/platform/gpu_info.cc             |  2 +-
 7 files changed, 112 insertions(+), 50 deletions(-)
 create mode 100644 paddle/memory/memory_test.cc

diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 8c290712fc9fb..fac442cca56b8 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1,11 +1,11 @@
 add_subdirectory(detail)
 
-cc_library(memory
-    SRCS
-    memory.cc)
+cc_library(memory SRCS memory.cc)
 
 cc_library(paddle_memory
     DEPS
     memory meta_data
     meta_cache memory_block
     buddy_allocator system_allocator)
+
+cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 6cb6422e473da..b9c3fc31c1523 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,7 +1,7 @@
 if(${WITH_GPU})
-  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags gpu_info)
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
 else(${WITH_GPU})
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
 endif(${WITH_GPU})
 
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index 9f334a7048fc6..ed2eedf9af800 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -24,10 +24,20 @@ BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
     : min_chunk_size_(min_chunk_size),
       max_chunk_size_(max_chunk_size),
       cache_(system_allocator->UseGpu()),
-      system_allocator_(std::move(system_allocator)) {
-  PADDLE_ASSERT(min_chunk_size > 0);
-  PADDLE_ASSERT(max_chunk_size > 0);
-  PADDLE_ASSERT(system_allocator != nullptr);
+      system_allocator_(std::move(system_allocator)) {}
+
+BuddyAllocator::~BuddyAllocator() {
+  DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these "
+                "have actually been freed";
+  while (!pool_.empty()) {
+    auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
+    DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_
+               << ")";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+    pool_.erase(pool_.begin());
+  }
 }
 
 inline size_t align(size_t size, size_t alignment) {
@@ -62,7 +72,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
       return nullptr;
     }
   } else {
-    DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it)
+    DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it)
                << " at address "
                << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
   }
@@ -142,6 +152,8 @@ void BuddyAllocator::Free(void* p) {
   // TODO(gangliao): Clean up if existing too much free memory
 }
 
+size_t BuddyAllocator::Used() { return total_used_; }
+
 void* BuddyAllocator::SystemAlloc(size_t size) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(index, size);
@@ -172,7 +184,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
 
   if (p == nullptr) return pool_.end();
 
-  DLOG(INFO) << " Creating and inserting new block " << p
+  DLOG(INFO) << "Creating and inserting new block " << p
              << " from system allocator";
 
   static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
@@ -211,20 +223,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
   auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
   pool_.erase(it);
 
-  DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_)
+  DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_)
              << ") into";
   block->split(cache_, size);
 
-  DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_)
+  DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_)
              << ")";
   block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
 
   // the rest of memory if exist
   if (block->has_right_buddy(cache_)) {
     if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
-      DLOG(INFO) << " Insert right block (" << block->right_buddy(cache_)
-                 << ", " << block->right_buddy(cache_)->total_size(cache_)
-                 << ")";
+      DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", "
+                 << block->right_buddy(cache_)->total_size(cache_) << ")";
 
       pool_.insert({block->right_buddy(cache_)->index(cache_),
                     block->right_buddy(cache_)->total_size(cache_),
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 2fd9c8162a98a..eeb2dc88364bb 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -93,33 +93,6 @@ class BuddyAllocator {
   std::mutex mutex_;
 };
 
-BuddyAllocator* GetCPUBuddyAllocator() {
-  static BuddyAllocator* a = nullptr;
-  if (a == nullptr) {
-    a = new BuddyAllocator(new CPUAllocator, platform::CpuMinChunkSize(),
-                           platform::CpuMaxChunkSize());
-  }
-  return a;
-}
-
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
-
-BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
-  static BuddyAllocator** as = NULL;
-  if (as == NULL) {
-    int gpu_num = platform::GpuDeviceCount();
-    as = new BuddyAllocator*[gpu_num];
-    for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] =
-          new BuddyAllocator(new GPUAllocator, platform::GpuMinChunkSize(),
-                             platform::GpuMaxChunkSize());
-    }
-  }
-  return as[gpu_id];
-}
-
-#endif  // PADDLE_ONLY_CPU
-
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 0d123d99e234a..dde6ff0ef3e54 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -22,37 +22,67 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
+detail::BuddyAllocator* GetCPUBuddyAllocator() {
+  static detail::BuddyAllocator* a = nullptr;
+  if (a == nullptr) {
+    a = new detail::BuddyAllocator(new detail::CPUAllocator,
+                                   platform::CpuMinChunkSize(),
+                                   platform::CpuMaxChunkSize());
+  }
+   return a;
+}
+
+#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+
+detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static detail::BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GpuDeviceCount();
+    as = new detail::BuddyAllocator*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+      platform::SetDeviceId(gpu);
+      as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator,
+                                           platform::GpuMinChunkSize(),
+                                           platform::GpuMaxChunkSize());
+    }
+  }
+  return as[gpu_id];
+}
+
+#endif  // PADDLE_ONLY_CPU
+
 void* Alloc(platform::Place pl, size_t size) {
 #ifndef PADDLE_ONLY_CPU
   if (paddle::platform::is_gpu_place(pl)) {
     size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size);
+    return GetGPUBuddyAllocator(gpu_id)->Alloc(size);
   }
 #endif  // PADDLE_ONLY_CPU
   PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return detail::GetCPUBuddyAllocator()->Alloc(size);
+  return GetCPUBuddyAllocator()->Alloc(size);
 }
 
 void Free(paddle::platform::Place pl, void* p) {
 #ifndef PADDLE_ONLY_CPU
   if (paddle::platform::is_gpu_place(pl)) {
     size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    detail::GetGPUBuddyAllocator(gpu_id)->Free(p);
+    GetGPUBuddyAllocator(gpu_id)->Free(p);
+    return;
   }
 #endif  // PADDLE_ONLY_CPU
   PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  detail::GetCPUBuddyAllocator()->Free(p);
+  GetCPUBuddyAllocator()->Free(p);
 }
 
 size_t Used(paddle::platform::Place pl) {
 #ifndef PADDLE_ONLY_CPU
   if (paddle::platform::is_gpu_place(pl)) {
     size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    return detail::GetGPUBuddyAllocator(gpu_id)->Used();
+    return GetGPUBuddyAllocator(gpu_id)->Used();
   }
 #endif  // PADDLE_ONLY_CPU
   PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return detail::GetCPUBuddyAllocator()->Used();
+  return GetCPUBuddyAllocator()->Used();
 }
 
 }  // namespace memory
diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
new file mode 100644
index 0000000000000..a98ccbb806830
--- /dev/null
+++ b/paddle/memory/memory_test.cc
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/memory.h"
+#include "paddle/platform/place.h"
+
+#include "gtest/gtest.h"
+
+TEST(BuddyAllocator, CPUAllocation) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::CPUPlace cpu;
+  p = paddle::memory::Alloc(cpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::memory::Free(cpu, p);
+}
+
+#ifndef PADDLE_ONLY_CPU
+
+TEST(BuddyAllocator, GPUAllocation) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::GPUPlace gpu(0);
+  p = paddle::memory::Alloc(gpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::memory::Free(gpu, p);
+}
+
+#endif  // PADDLE_ONLY_CPU
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index 05a243c50696a..fe475d23ce08b 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -56,7 +56,7 @@ size_t GpuMaxAllocSize() {
   GpuMemoryUsage(available, total);
 
   // Reserve the rest for page tables, etc.
-  return total * FLAGS_fraction_of_gpu_memory_to_use;
+  return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
 }
 
 size_t GpuMinChunkSize() {

From 936cd1e11885daf35ae34b9fbf11a798e9e51022 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 6 Jul 2017 01:34:51 +0800
Subject: [PATCH 22/38] FIX: code format

---
 paddle/memory/memory.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index dde6ff0ef3e54..43f2084e8dfd0 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -29,7 +29,7 @@ detail::BuddyAllocator* GetCPUBuddyAllocator() {
                                    platform::CpuMinChunkSize(),
                                    platform::CpuMaxChunkSize());
   }
-   return a;
+  return a;
 }
 
 #ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.

From 5d2e8edb3ec86e13b7684cbe372650d21fe7954d Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 6 Jul 2017 01:37:20 +0800
Subject: [PATCH 23/38] FIX: dynamic loader deps

---
 paddle/platform/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 0ad11f1b106be..bf3e8f31abe44 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -6,4 +6,4 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags)
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
-cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags)
+cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags glog)

From 3ad8e364715915fba5909c137834e34f38b6e9ac Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 6 Jul 2017 11:24:01 +0800
Subject: [PATCH 24/38] FIX: merge static libs with propagation dependencies

---
 cmake/generic.cmake | 51 ++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index cae9524b2fe14..87d8caaec4059 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -99,15 +99,37 @@ function(merge_static_libs TARGET_NAME)
   set(libs ${ARGN})
   list(REMOVE_DUPLICATES libs)
 
-  # First get the file names of the libraries to be merged
+  # Get all propagation dependencies from the merged libraries
   foreach(lib ${libs})
+    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
+  endforeach()
+
+  # To produce a library we need at least one source file.
+  # It is created by add_custom_command below and will helps 
+  # also help to track dependencies.
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+
+  # Make the generated dummy source file depended on all static input
+  # libs. If input lib changes,the source file is touched
+  # which causes the desired effect (relink).
+  add_custom_command(OUTPUT ${dummyfile}
+    COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+    DEPENDS ${libs})
+
+  # Generate dummy staic lib
+  file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+  add_library(${TARGET_NAME} STATIC ${dummyfile})
+  target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+  foreach(lib ${libs})
+    # Get the file names of the libraries to be merged
     set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
   endforeach()
 
+  # Get the file name of the generated library
+  set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+
   if(APPLE) # Use OSX's libtool to merge archives
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-    add_library(${TARGET_NAME} STATIC ${dummyfile})
 		add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
       COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
@@ -117,7 +139,8 @@ function(merge_static_libs TARGET_NAME)
       set(objdir ${lib}.objdir)
 
       add_custom_command(OUTPUT ${objdir}
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir})
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
+        DEPENDS ${lib})
 
       add_custom_command(OUTPUT ${objlistfile}
         COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
@@ -125,23 +148,9 @@ function(merge_static_libs TARGET_NAME)
         DEPENDS ${lib} ${objdir}
         WORKING_DIRECTORY ${objdir})
 
-      # Empty dummy source file that goes into merged library
-      set(mergebase ${lib}.mergebase.c)
-      add_custom_command(OUTPUT ${mergebase}
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
-        DEPENDS ${objlistfile})
-
-      list(APPEND mergebases "${mergebase}")
-    endforeach()
-
-    # We need a target for the output merged library
-    add_library(${TARGET_NAME} STATIC ${mergebases})
-    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
-
-    foreach(lib ${libs})
       add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist"
-      WORKING_DIRECTORY ${lib}.objdir)
+        COMMAND ${CMAKE_AR} ru ${outlibfile} *.o 
+        WORKING_DIRECTORY ${objdir})
     endforeach()
 
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD

From a669bf48d966a92206c57d72258bb625b5ff2fbc Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 6 Jul 2017 13:38:11 +0800
Subject: [PATCH 25/38] FIX: explicit construct pool element

---
 paddle/memory/detail/buddy_allocator.cc | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index ed2eedf9af800..2cfacec46c108 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -118,8 +118,9 @@ void BuddyAllocator::Free(void* p) {
 
     if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
       // Take away right buddy from pool
-      pool_.erase({right_buddy->index(cache_), right_buddy->total_size(cache_),
-                   right_buddy});
+      pool_.erase(IndexSizeAddress(right_buddy->index(cache_),
+                                   right_buddy->total_size(cache_),
+                                   right_buddy));
 
       // merge its right buddy to the block
       block->merge(cache_, right_buddy);
@@ -135,8 +136,8 @@ void BuddyAllocator::Free(void* p) {
 
     if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
       // Take away right buddy from pool
-      pool_.erase({left_buddy->index(cache_), left_buddy->total_size(cache_),
-                   left_buddy});
+      pool_.erase(IndexSizeAddress(left_buddy->index(cache_),
+                                   left_buddy->total_size(cache_), left_buddy));
 
       // merge the block to its left buddy
       left_buddy->merge(cache_, block);
@@ -147,7 +148,8 @@ void BuddyAllocator::Free(void* p) {
   // Dumping this block into pool
   DLOG(INFO) << "Inserting free block (" << block << ", "
              << block->total_size(cache_) << ")";
-  pool_.insert({block->index(cache_), block->total_size(cache_), block});
+  pool_.insert(
+      IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
 
   // TODO(gangliao): Clean up if existing too much free memory
 }
@@ -193,14 +195,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
   total_free_ += max_chunk_size_;
 
   // dump the block into pool
-  return pool_.insert({index, max_chunk_size_, p}).first;
+  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
   size_t index = 0;
 
   while (1) {
-    auto it = pool_.lower_bound({index, size, nullptr});
+    auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr));
 
     // no match chunk memory
     if (it == pool_.end()) return it;
@@ -237,9 +239,10 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
       DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", "
                  << block->right_buddy(cache_)->total_size(cache_) << ")";
 
-      pool_.insert({block->right_buddy(cache_)->index(cache_),
-                    block->right_buddy(cache_)->total_size(cache_),
-                    block->right_buddy(cache_)});
+      pool_.insert(
+          IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
+                           block->right_buddy(cache_)->total_size(cache_),
+                           block->right_buddy(cache_)));
     }
   }
 

From adf8c95b62fc5ef1f608bc06dce32bb4b396828c Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 6 Jul 2017 15:40:22 +0800
Subject: [PATCH 26/38] FIX: propagation dependencies under linux

---
 cmake/generic.cmake | 68 ++++++++++++++++++++++++++-------------------
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 87d8caaec4059..3900ea26048d3 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -104,36 +104,32 @@ function(merge_static_libs TARGET_NAME)
     list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
   endforeach()
 
-  # To produce a library we need at least one source file.
-  # It is created by add_custom_command below and will helps 
-  # also help to track dependencies.
-  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-
-  # Make the generated dummy source file depended on all static input
-  # libs. If input lib changes,the source file is touched
-  # which causes the desired effect (relink).
-  add_custom_command(OUTPUT ${dummyfile}
-    COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
-    DEPENDS ${libs})
-
-  # Generate dummy staic lib
-  file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-  add_library(${TARGET_NAME} STATIC ${dummyfile})
-  target_link_libraries(${TARGET_NAME} ${libs_deps})
+  if(APPLE) # Use OSX's libtool to merge archives
+    # To produce a library we need at least one source file.
+    # It is created by add_custom_command below and will helps 
+    # also help to track dependencies.
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
 
-  foreach(lib ${libs})
-    # Get the file names of the libraries to be merged
-    set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-  endforeach()
+    # Make the generated dummy source file depended on all static input
+    # libs. If input lib changes,the source file is touched
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${dummyfile}
+      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+      DEPENDS ${libs})
 
-  # Get the file name of the generated library
-  set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+    # Generate dummy staic lib
+    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
 
-  if(APPLE) # Use OSX's libtool to merge archives
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    endforeach()
 		add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
       COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
-	else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
     foreach(lib ${libs})
       set(objlistfile ${lib}.objlist) # list of objects in the input library
       set(objdir ${lib}.objdir)
@@ -148,13 +144,27 @@ function(merge_static_libs TARGET_NAME)
         DEPENDS ${lib} ${objdir}
         WORKING_DIRECTORY ${objdir})
 
-      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-        COMMAND ${CMAKE_AR} ru ${outlibfile} *.o 
-        WORKING_DIRECTORY ${objdir})
+      # Empty dummy source file that goes into merged library		
+      set(mergebase ${lib}.mergebase.c)		
+      add_custom_command(OUTPUT ${mergebase}		
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
+        DEPENDS ${objlistfile})		
+
+      list(APPEND mergebases "${mergebase}")
     endforeach()
 
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_RANLIB} ${outlibfile})
+    add_library(${TARGET_NAME} STATIC ${mergebases})
+    target_link_libraries(${TARGET_NAME} ${libs_deps}) 
+
+    # Get the file name of the generated library
+    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+
+    foreach(lib ${libs})
+      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o  
+        COMMAND ${CMAKE_RANLIB} ${outlibfile}
+        WORKING_DIRECTORY ${lib}.objdir)
+    endforeach()
   endif()
 endfunction(merge_static_libs)
 

From ddfa6cf0d1fe91f8bf2e1d55841afee9e30d1859 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 6 Jul 2017 17:07:04 +0800
Subject: [PATCH 27/38] FIX: remove boost from memory folder

---
 paddle/memory/memory.cc | 56 +++++++++++++++++++----------------------
 paddle/memory/memory.h  | 11 +++++---
 2 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 43f2084e8dfd0..def580f7a4b22 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -32,7 +32,22 @@ detail::BuddyAllocator* GetCPUBuddyAllocator() {
   return a;
 }
 
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+template <>
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+template <>
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+template <>
+size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
+  return GetCPUBuddyAllocator()->Used();
+}
+
+#ifndef PADDLE_ONLY_CPU
 
 detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   static detail::BuddyAllocator** as = NULL;
@@ -49,41 +64,22 @@ detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   return as[gpu_id];
 }
 
-#endif  // PADDLE_ONLY_CPU
+template <>
+void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
+  return GetGPUBuddyAllocator(place.device)->Alloc(size);
+}
 
-void* Alloc(platform::Place pl, size_t size) {
-#ifndef PADDLE_ONLY_CPU
-  if (paddle::platform::is_gpu_place(pl)) {
-    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    return GetGPUBuddyAllocator(gpu_id)->Alloc(size);
-  }
-#endif  // PADDLE_ONLY_CPU
-  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return GetCPUBuddyAllocator()->Alloc(size);
+template <>
+void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
 }
 
-void Free(paddle::platform::Place pl, void* p) {
-#ifndef PADDLE_ONLY_CPU
-  if (paddle::platform::is_gpu_place(pl)) {
-    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    GetGPUBuddyAllocator(gpu_id)->Free(p);
-    return;
-  }
-#endif  // PADDLE_ONLY_CPU
-  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  GetCPUBuddyAllocator()->Free(p);
+template <>
+size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
+  return GetGPUBuddyAllocator(place.device)->Used();
 }
 
-size_t Used(paddle::platform::Place pl) {
-#ifndef PADDLE_ONLY_CPU
-  if (paddle::platform::is_gpu_place(pl)) {
-    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    return GetGPUBuddyAllocator(gpu_id)->Used();
-  }
 #endif  // PADDLE_ONLY_CPU
-  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return GetCPUBuddyAllocator()->Used();
-}
 
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index a33092bade65e..2d6f4fd2a08ee 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,9 +19,14 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
-void* Alloc(paddle::platform::Place, size_t);
-void Free(paddle::platform::Place, void*);
-size_t Used(paddle::platform::Place);
+template <class Place>
+void* Alloc(Place, size_t);
+
+template <class Place>
+void Free(Place, void*);
+
+template <class Place>
+size_t Used(Place);
 
 }  // namespace memory
 }  // namespace paddle

From 199b5fcb45c69560de1b24b3147f5e7db309abe3 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Mon, 10 Jul 2017 11:22:17 +0800
Subject: [PATCH 28/38] ENH: refine code comments

---
 paddle/memory/detail/buddy_allocator.h   |  3 ++-
 paddle/memory/detail/meta_cache.h        | 25 +++++++++---------------
 paddle/memory/detail/system_allocator.cc |  4 ++--
 paddle/memory/detail/system_allocator.h  |  6 +++---
 4 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index eeb2dc88364bb..a89dd8eb7c19d 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -42,7 +42,7 @@ class BuddyAllocator {
   void Free(void*);
   size_t Used();
 
- private:
+ public:
   // Disable copy and assignment.
   BuddyAllocator(const BuddyAllocator&) = delete;
   BuddyAllocator& operator=(const BuddyAllocator&) = delete;
@@ -57,6 +57,7 @@ class BuddyAllocator {
 
   /*! \brief If existing chunks are not suitable, refill pool */
   PoolSet::iterator RefillPool();
+
   /**
    *  \brief Find the suitable chunk from existing pool
    *
diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h
index 3ca1020d22ead..ca0789779e273 100644
--- a/paddle/memory/detail/meta_cache.h
+++ b/paddle/memory/detail/meta_cache.h
@@ -23,14 +23,14 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-/*! A cache for accessing memory block meta-data that may be expensive to access
-   directly.
-
-    Note: this class exists to unify the metadata format between GPU and CPU
-   allocations.
-    It should be removed when the CPU can access all GPU allocations directly
-   via UVM.
-*/
+/**
+ *  \brief A cache for accessing memory block meta-data that may be expensive
+ *         to access directly.
+ *
+ *  \note  This class exists to unify the metadata format between GPU and CPU
+ *         allocations. It should be removed when the CPU can access all GPU
+ *         allocations directly via UVM.
+ */
 class MetadataCache {
  public:
   MetadataCache(bool uses_gpu);
@@ -42,14 +42,7 @@ class MetadataCache {
   /*! \brief Store the associated metadata for the specified memory block. */
   void store(MemoryBlock*, const Metadata&);
 
- public:
-  /*! \brief Acquire any external metadata updates. */
-  void acquire(MemoryBlock*);
-
-  /*! \brief Publish any local updates externally. */
-  void release(MemoryBlock*);
-
-  /*! \brief Indicate that the specified metadata will no longer be used */
+  /*! \brief Indicate that the specified metadata will no longer be used. */
   void invalidate(MemoryBlock*);
 
  public:
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 75a2c91ef938e..1579174b1a6ff 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -60,7 +60,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
   free(p);
 }
 
-bool CPUAllocator::UseGpu() { return false; }
+bool CPUAllocator::UseGpu() const { return false; }
 
 #ifndef PADDLE_ONLY_CPU
 
@@ -133,7 +133,7 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
   }
 }
 
-bool GPUAllocator::UseGpu() { return true; }
+bool GPUAllocator::UseGpu() const { return true; }
 
 #endif  // PADDLE_ONLY_CPU
 
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 555061a533ffe..04efcd9709445 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -32,14 +32,14 @@ class SystemAllocator {
   virtual ~SystemAllocator() {}
   virtual void* Alloc(size_t& index, size_t size) = 0;
   virtual void Free(void* p, size_t size, size_t index) = 0;
-  virtual bool UseGpu() = 0;
+  virtual bool UseGpu() const = 0;
 };
 
 class CPUAllocator : public SystemAllocator {
  public:
   virtual void* Alloc(size_t& index, size_t size);
   virtual void Free(void* p, size_t size, size_t index);
-  virtual bool UseGpu();
+  virtual bool UseGpu() const;
 };
 
 #ifndef PADDLE_ONLY_CPU
@@ -47,7 +47,7 @@ class GPUAllocator : public SystemAllocator {
  public:
   virtual void* Alloc(size_t& index, size_t size);
   virtual void Free(void* p, size_t size, size_t index);
-  virtual bool UseGpu();
+  virtual bool UseGpu() const;
 
  private:
   size_t gpu_alloc_size_ = 0;

From d4017cadcd0fa07d8874e052ffa91700ebb32a05 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 11 Jul 2017 15:18:38 +0800
Subject: [PATCH 29/38] ENH: Add auto-free if allocate too much

---
 paddle/memory/detail/buddy_allocator.cc | 69 ++++++++++++++++++++++++-
 paddle/memory/detail/buddy_allocator.h  |  3 ++
 2 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index 3f630973e906c..27c1b4033b53b 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -152,7 +152,7 @@ void BuddyAllocator::Free(void* p) {
       IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
 
   // Clean up if existing too much free memory
-  
+
   // Prefer freeing fallback allocation first
   CleanIdleFallBackAlloc();
 
@@ -198,6 +198,12 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
   static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
                                      max_chunk_size_, nullptr, nullptr);
 
+  // gpu fallback allocation
+  if (system_allocator_->UseGpu() &&
+      static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
+    fallback_alloc_count_++;
+  }
+
   total_free_ += max_chunk_size_;
 
   // dump the block into pool
@@ -256,9 +262,68 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
 }
 
 void BuddyAllocator::CleanIdleFallBackAlloc() {
-  
+  // If fallback allocation does not exist, return directly
+  if (!fallback_alloc_count_) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    // If no GPU fallback allocator, return
+    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
+      return;
+    }
+
+    DLOG(INFO) << "Return block " << block << " to fallback allocator.";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= max_chunk_size_;
+    fallback_alloc_count_--;
+
+    // If no fall allocation exists, return directly
+    if (!fallback_alloc_count_) return;
+  }
 }
 
+void BuddyAllocator::CleanIdleNormalAlloc() {
+  auto shall_free_alloc = [&]() -> bool {
+    // free all fallback allocations
+    if (fallback_alloc_count_ > 0) {
+      return true;
+    }
+    // keep 2x overhead if we haven't fallen back
+    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
+      return true;
+    }
+    return false;
+  };
+
+  if (!shall_free_alloc()) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    DLOG(INFO) << "Return block " << block << " to base allocator.";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= max_chunk_size_;
+
+    if (!shall_free_alloc()) return;
+  }
+}
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 14ee1fa07c031..4fa3fb0ee5f82 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -94,6 +94,9 @@ class BuddyAllocator {
    */
   PoolSet pool_;
 
+  /*! Record fallback allocation count for auto-scaling */
+  size_t fallback_alloc_count_ = 0;
+
  private:
   /*! Unify the metadata format between GPU and CPU allocations */
   MetadataCache cache_;

From 6a3b8416df124153d4a1fd1f8f559107578ed58e Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 11 Jul 2017 15:20:43 +0800
Subject: [PATCH 30/38] FIX: clang-format

---
 paddle/memory/memory_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index fed7444798fe2..9fdcd03b1a664 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -33,7 +33,7 @@ TEST(BuddyAllocator, CPUAllocation) {
 TEST(BuddyAllocator, CPUMultAlloc) {
   paddle::platform::CPUPlace cpu;
 
-  std::vector<void*> ps;
+  std::vector<void *> ps;
   ps.reserve(8);
 
   for (auto size : {256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {

From 383b96f32c60ec542819c62b4e09009cae9afc9d Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 11 Jul 2017 16:26:58 +0800
Subject: [PATCH 31/38] FIX: merge conflicts

---
 paddle/memory/detail/meta_cache.cc | 2 +-
 paddle/memory/memory.cc            | 2 +-
 paddle/platform/CMakeLists.txt     | 2 +-
 paddle/platform/device_context.h   | 3 ++-
 paddle/platform/gpu_info.cc        | 4 ++--
 paddle/platform/gpu_info.h         | 2 +-
 6 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc
index 189ab4fc7bb74..30ff80e7bac0b 100644
--- a/paddle/memory/detail/meta_cache.cc
+++ b/paddle/memory/detail/meta_cache.cc
@@ -25,7 +25,7 @@ MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
 Metadata MetadataCache::load(const MemoryBlock* block) {
   if (uses_gpu_) {
     auto existing_metadata = cache_.find(block);
-    assert(existing_metadata->second.check_guards());
+    PADDLE_ASSERT(existing_metadata->second.check_guards());
     return existing_metadata->second;
   } else {
     PADDLE_ASSERT(reinterpret_cast<const Metadata*>(block)->check_guards());
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index def580f7a4b22..430ce98bfc145 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -52,7 +52,7 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
 detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   static detail::BuddyAllocator** as = NULL;
   if (as == NULL) {
-    int gpu_num = platform::GpuDeviceCount();
+    int gpu_num = platform::GetDeviceCount();
     as = new detail::BuddyAllocator*[gpu_num];
     for (int gpu = 0; gpu < gpu_num; gpu++) {
       platform::SetDeviceId(gpu);
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 4b3f55b3c7724..d16c747aee2f9 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -8,4 +8,4 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags glog)
 
-nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3)
+nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 gpu_info)
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 160eb4e12060b..02194581d1dcb 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -16,10 +16,11 @@ limitations under the License. */
 
 #include "paddle/framework/enforce.h"
 #ifndef PADDLE_ONLY_CPU
-#include "paddle/platform/cuda.h"
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/dynload/curand.h"
+#include "paddle/platform/error.h"
+#include "paddle/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
 #include "paddle/platform/place.h"
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index fe475d23ce08b..9b917f9d35f7f 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -23,11 +23,11 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
 namespace paddle {
 namespace platform {
 
-int GpuDeviceCount() {
+int GetDeviceCount() {
   int count;
   throw_on_error(
       cudaGetDeviceCount(&count),
-      "cudaGetDeviceCount failed in paddle::platform::GpuDeviceCount");
+      "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount");
   return count;
 }
 
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index 81ee5f6e0a95e..79e71956bd32e 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -22,7 +22,7 @@ namespace paddle {
 namespace platform {
 
 //! Get the total number of GPU devices in system.
-int GpuDeviceCount();
+int GetDeviceCount();
 
 //! Get the current GPU device id in system.
 int GetCurrentDeviceId();

From ff98e3c1ece983403ebdfa57f07d3bdf58f85647 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 13 Jul 2017 14:26:48 +0800
Subject: [PATCH 32/38] ENH: Remove comments

---
 paddle/memory/detail/system_allocator.h | 12 +++++-------
 paddle/platform/gpu_info.cc             |  1 -
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 04efcd9709445..82ba322e05757 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -20,13 +20,11 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-// SystemAllocator is the parent class of CPUAllocator and
-// GPUAllocator.  A BuddyAllocator object uses a SystemAllocator*
-// pointing to the underlying system allocator.  An alternative to
-// this class hierarchy is to pass a system allocator class to
-// BuddyAllocator as a template parameter.  This approach makes
-// BuddyAllocator a class template, and it's very complicated
-// algorithm would make the buddy_allocator.h messy.
+/**
+ * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
+ *        A BuddyAllocator object uses a SystemAllocator* pointing to the
+ *        underlying system allocator.
+ */
 class SystemAllocator {
  public:
   virtual ~SystemAllocator() {}
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index 9b917f9d35f7f..a1383d3524aed 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -65,7 +65,6 @@ size_t GpuMinChunkSize() {
 }
 
 size_t GpuMaxChunkSize() {
-  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
   size_t total = 0;
   size_t available = 0;
 

From 00572aa451d44ccb32b1c59a59241d7000c68fda Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 13 Jul 2017 19:14:09 +0800
Subject: [PATCH 33/38] Add memory alignment test

---
 paddle/memory/memory_test.cc | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 9fdcd03b1a664..4c9b3311bb12c 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -13,9 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/memory.h"
+#include "gtest/gtest.h"
 #include "paddle/platform/place.h"
 
-#include "gtest/gtest.h"
+template <typename T>
+inline bool is_aligned(T *p, size_t n = alignof(T)) {
+  return 0 == (reinterpret_cast<uintptr_t>(p) % n);
+}
 
 TEST(BuddyAllocator, CPUAllocation) {
   void *p = nullptr;
@@ -36,11 +40,13 @@ TEST(BuddyAllocator, CPUMultAlloc) {
   std::vector<void *> ps;
   ps.reserve(8);
 
-  for (auto size : {256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+  for (auto size :
+       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
     ps.emplace_back(paddle::memory::Alloc(cpu, size));
   }
 
   for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p, 32), true);
     paddle::memory::Free(cpu, p);
   }
 }
@@ -60,4 +66,21 @@ TEST(BuddyAllocator, GPUAllocation) {
   paddle::memory::Free(gpu, p);
 }
 
+TEST(BuddyAllocator, GPUMultAlloc) {
+  paddle::platform::GPUPlace gpu;
+
+  std::vector<void *> ps;
+  ps.reserve(8);
+
+  for (auto size :
+       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps.emplace_back(paddle::memory::Alloc(gpu, size));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p, 32), true);
+    paddle::memory::Free(gpu, p);
+  }
+}
+
 #endif  // PADDLE_ONLY_CPU

From ab5fe1e9071ef67850683442035f27c6c602e126 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 14 Jul 2017 11:52:03 +0800
Subject: [PATCH 34/38] ENH: memory test: check alignment and memory size

---
 paddle/memory/memory_test.cc | 80 ++++++++++++++++++++++++++++++------
 1 file changed, 67 insertions(+), 13 deletions(-)

diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 4c9b3311bb12c..458c8b2e24fa2 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -13,14 +13,36 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/memory.h"
-#include "gtest/gtest.h"
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_data.h"
+
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 
-template <typename T>
-inline bool is_aligned(T *p, size_t n = alignof(T)) {
+#include <gtest/gtest.h>
+#include <unordered_map>
+
+inline bool is_aligned(void const *p, const size_t n) {
   return 0 == (reinterpret_cast<uintptr_t>(p) % n);
 }
 
+size_t align(size_t size, paddle::platform::CPUPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::CpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+size_t align(size_t size, paddle::platform::GPUPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::GpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+void update_size(size_t &total_size, const size_t size) {}
+
 TEST(BuddyAllocator, CPUAllocation) {
   void *p = nullptr;
 
@@ -37,17 +59,33 @@ TEST(BuddyAllocator, CPUAllocation) {
 TEST(BuddyAllocator, CPUMultAlloc) {
   paddle::platform::CPUPlace cpu;
 
-  std::vector<void *> ps;
-  ps.reserve(8);
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(cpu);
+  EXPECT_EQ(total_size, 0UL);
 
   for (auto size :
        {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
-    ps.emplace_back(paddle::memory::Alloc(cpu, size));
+    ps[paddle::memory::Alloc(cpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(size, cpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
   }
 
   for (auto p : ps) {
-    EXPECT_EQ(is_aligned(p, 32), true);
-    paddle::memory::Free(cpu, p);
+    EXPECT_EQ(is_aligned(p.first, 32), true);
+    paddle::memory::Free(cpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, cpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
   }
 }
 
@@ -69,17 +107,33 @@ TEST(BuddyAllocator, GPUAllocation) {
 TEST(BuddyAllocator, GPUMultAlloc) {
   paddle::platform::GPUPlace gpu;
 
-  std::vector<void *> ps;
-  ps.reserve(8);
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(gpu);
+  EXPECT_EQ(total_size, 0UL);
 
   for (auto size :
        {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
-    ps.emplace_back(paddle::memory::Alloc(gpu, size));
+    ps[paddle::memory::Alloc(gpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(gpu) == total_size) continue;
+
+    size_t aligned_size = align(size, gpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
   }
 
   for (auto p : ps) {
-    EXPECT_EQ(is_aligned(p, 32), true);
-    paddle::memory::Free(gpu, p);
+    EXPECT_EQ(is_aligned(p.first, 32), true);
+    paddle::memory::Free(gpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(gpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, gpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
   }
 }
 

From 21b7915d9122d29bdb7506ab2e30049653ccf52a Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 14 Jul 2017 12:03:09 +0800
Subject: [PATCH 35/38] Fix condition compile

---
 paddle/memory/memory_test.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 458c8b2e24fa2..e13cbabb268a8 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -34,13 +34,6 @@ size_t align(size_t size, paddle::platform::CPUPlace place) {
   return remaining == 0 ? size : size + (alignment - remaining);
 }
 
-size_t align(size_t size, paddle::platform::GPUPlace place) {
-  size += sizeof(paddle::memory::detail::Metadata);
-  size_t alignment = paddle::platform::GpuMinChunkSize();
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
-
 void update_size(size_t &total_size, const size_t size) {}
 
 TEST(BuddyAllocator, CPUAllocation) {
@@ -91,6 +84,13 @@ TEST(BuddyAllocator, CPUMultAlloc) {
 
 #ifndef PADDLE_ONLY_CPU
 
+size_t align(size_t size, paddle::platform::GPUPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::GpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
 TEST(BuddyAllocator, GPUAllocation) {
   void *p = nullptr;
 

From ea916c84874e4a036650bccb2b2eae142a6c36ae Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 14 Jul 2017 13:00:03 +0800
Subject: [PATCH 36/38] Fix: alignment metric

---
 paddle/memory/memory_test.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index e13cbabb268a8..2a2cb98a31b63 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -24,7 +24,7 @@ limitations under the License. */
 #include <unordered_map>
 
 inline bool is_aligned(void const *p, const size_t n) {
-  return 0 == (reinterpret_cast<uintptr_t>(p) % n);
+  return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
 }
 
 size_t align(size_t size, paddle::platform::CPUPlace place) {
@@ -34,8 +34,6 @@ size_t align(size_t size, paddle::platform::CPUPlace place) {
   return remaining == 0 ? size : size + (alignment - remaining);
 }
 
-void update_size(size_t &total_size, const size_t size) {}
-
 TEST(BuddyAllocator, CPUAllocation) {
   void *p = nullptr;
 

From 033523ea9d01bb0e860dbbfd6c5bab72d2c5b149 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 14 Jul 2017 13:02:17 +0800
Subject: [PATCH 37/38] update

---
 paddle/memory/memory_test.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 2a2cb98a31b63..53cc63a098d08 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <unordered_map>
 
-inline bool is_aligned(void const *p, const size_t n) {
+inline bool is_aligned(void const *p) {
   return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
 }
 
@@ -68,7 +68,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
   }
 
   for (auto p : ps) {
-    EXPECT_EQ(is_aligned(p.first, 32), true);
+    EXPECT_EQ(is_aligned(p.first), true);
     paddle::memory::Free(cpu, p.first);
 
     // Buddy Allocator doesn't manage too large memory chunk
@@ -123,7 +123,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
   }
 
   for (auto p : ps) {
-    EXPECT_EQ(is_aligned(p.first, 32), true);
+    EXPECT_EQ(is_aligned(p.first), true);
     paddle::memory::Free(gpu, p.first);
 
     // Buddy Allocator doesn't manage too large memory chunk

From 03b3d0d8a8b8e90f997e1a2cec49bb04486adc8a Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 14 Jul 2017 20:12:35 +0800
Subject: [PATCH 38/38] Follow comments

---
 paddle/platform/cpu_info.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc
index 3da04420e57c3..1905cfeee6303 100644
--- a/paddle/platform/cpu_info.cc
+++ b/paddle/platform/cpu_info.cc
@@ -54,8 +54,8 @@ size_t CpuMaxAllocSize() {
 }
 
 size_t CpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
 }
 
 size_t CpuMaxChunkSize() {