From 0e6ddcc7bc63eb6ddfe5f12f4d9060625befe41a Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 10:01:10 +0800 Subject: [PATCH 01/38] ENH: Add GPU throw error --- paddle/platform/error.h | 87 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 paddle/platform/error.h diff --git a/paddle/platform/error.h b/paddle/platform/error.h new file mode 100644 index 0000000000000..93424bb610965 --- /dev/null +++ b/paddle/platform/error.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include + +#ifndef PADDLE_ONLY_CPU + +#include +#include +#include +#include +#include + +#endif // PADDLE_ONLY_CPU + +namespace paddle { +namespace platform { + +#ifndef PADDLE_ONLY_CPU + +inline void throw_on_error(cudaError_t e, const char* message) { + if (e) { + throw thrust::system_error(e, thrust::cuda_category(), message); + } +} + +inline void throw_on_error(curandStatus_t stat, const char* message) { + if (stat != CURAND_STATUS_SUCCESS) { + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + message); + } +} + +inline void throw_on_error(cudnnStatus_t stat, const char* message) { + std::stringstream ss; + if (stat == CUDNN_STATUS_SUCCESS) { + return; + } else { + ss << cudnnGetErrorString(stat); + ss << ", " << message; + throw std::runtime_error(ss.str()); + } +} + +inline void throw_on_error(cublasStatus_t stat, const char* message) { + std::stringstream ss; + if (stat == CUBLAS_STATUS_SUCCESS) { + return; + } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + ss << "CUBLAS: not initialized"; + } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { + ss << "CUBLAS: alloc failed"; + } else if (stat == CUBLAS_STATUS_INVALID_VALUE) { + ss << "CUBLAS: invalid value"; + } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { + ss << "CUBLAS: arch mismatch"; + } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) { + ss << "CUBLAS: mapping error"; + } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { + ss << "CUBLAS: execution failed"; + } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { + ss << "CUBLAS: internal error"; + } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { + ss << "CUBLAS: not supported"; + } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { + ss << "CUBLAS: license error"; + } + ss << ", " << message; + throw std::runtime_error(ss.str()); +} + +inline void throw_on_error(cublasStatus_t stat) { + const char* message = ""; + throw_on_error(stat, message); +} + +#endif // PADDLE_ONLY_CPU + +inline void throw_on_error(int stat, const char* message) { + if (stat) { + throw std::runtime_error(message + (", stat = " + std::to_string(stat))); + } +} + +} // namespace platform +} // namespace paddle From d3b77a5bc053b77309ecc094450e755604217674 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 13:56:38 +0800 Subject: [PATCH 02/38] ENH: Add Gpu info --- paddle/platform/gpu_info.cc | 49 +++++++++++++++++++++++++++++++++++++ paddle/platform/gpu_info.h | 36 +++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 paddle/platform/gpu_info.cc create mode 100644 paddle/platform/gpu_info.h diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc new file mode 100644 index 0000000000000..4208d83078c90 --- /dev/null +++ b/paddle/platform/gpu_info.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/gpu_info.h" +#include "gflags/gflags.h" +#include "paddle/platform/error.h" + +DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, + "Default use 95% of GPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +int GpuDeviceCount() { + int count; + throw_on_error( + cudaGetDeviceCount(&count), + "cudaGetDeviceCount failed in paddle::platform::GpuDeviceCount"); + return count; +} + +void GpuMemoryUsage(size_t& available, size_t& total) { + throw_on_error(cudaMemGetInfo(&available, &total), + "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); +} + +size_t GpuMaxAllocSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + return total * FLAGS_fraction_of_gpu_memory_to_use; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h new file mode 100644 index 0000000000000..174f093b435c9 --- /dev/null +++ b/paddle/platform/gpu_info.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifndef PADDLE_ONLY_CPU + +#include + +namespace paddle { +namespace platform { + +//! Get the total number of GPU devices in system. +int GpuDeviceCount(); + +//!Get the memory usage of current GPU device. +void GpuMemoryUsage(size_t& available, size_t& total); + +//! Get the maximum allocation size of current GPU device. +size_t GpuMaxAllocSize(); + +} // namespace platform +} // namespace paddle + +#endif // PADDLE_ONLY_CPU From b29923f902dc6da1416a94bc153448f1546e62b2 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 13:56:57 +0800 Subject: [PATCH 03/38] ENH: Add CPU info --- paddle/platform/cpu_info.cc | 55 +++++++++++++++++++++++++++++++++++++ paddle/platform/cpu_info.h | 26 ++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 paddle/platform/cpu_info.cc create mode 100644 paddle/platform/cpu_info.h diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc new file mode 100644 index 0000000000000..deff76502e8cb --- /dev/null +++ b/paddle/platform/cpu_info.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/cpu_info.h" + +#ifdef __APPLE__ +#include +#include +#else +#include +#endif + +#include "gflags/gflags.h" +#include "paddle/platform/error.h" + +DEFINE_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +inline size_t CpuTotalPhysicalMemory() { +#ifdef __APPLE__ + int mib[2]; + mib[0] = CTL_HW; + mib[1] = HW_MEMSIZE; + int64_t size = 0; + size_t len = sizeof(size); + if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; + return 0L; +#else + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +#endif +} + +size_t CpuTotalMemory() { + return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h new file mode 100644 index 0000000000000..3b768589e150f --- /dev/null +++ b/paddle/platform/cpu_info.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace platform { + +//! Get the total memory on host machine. +size_t CpuTotalMemory(); + +} // namespace platform +} // namespace paddle From 169022d0148a77cd10f16a82e841a75750e7e173 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 14:04:47 +0800 Subject: [PATCH 04/38] FIX: Improve fallback gpu allocator --- paddle/memory/detail/CMakeLists.txt | 4 +- paddle/memory/detail/system_allocator.cc | 64 ++++++++++++++----- paddle/memory/detail/system_allocator.h | 15 +++-- paddle/memory/detail/system_allocator_test.cc | 14 ++-- paddle/platform/CMakeLists.txt | 4 ++ paddle/platform/cpu_info_test.cc | 18 ++++++ paddle/platform/cuda.h | 40 ------------ 7 files changed, 85 insertions(+), 74 deletions(-) create mode 100644 paddle/platform/cpu_info_test.cc delete mode 100644 paddle/platform/cuda.h diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 72d3749ad789e..6caa97a76bbfd 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,6 +1,8 @@ if(${WITH_GPU}) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) - nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) + nv_test(system_allocator_test + SRCS system_allocator_test.cc + DEPS system_allocator gpu_info gflags) else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 50bec926f83de..332ff062d4784 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -13,32 +13,39 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/detail/system_allocator.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/error.h" +#include "paddle/platform/gpu_info.h" #include // for malloc and free #include // for mlock and munlock #include "gflags/gflags.h" -#include "paddle/platform/assert.h" -#include "paddle/platform/cuda.h" // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the amount // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. -DEFINE_bool(use_pinned_memory, false, - "If set, allocate cpu/gpu pinned memory."); +DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory."); namespace paddle { namespace memory { namespace detail { -void* CPUAllocator::Alloc(size_t size) { +void* CPUAllocator::Alloc(size_t& index, size_t size) { // According to http://www.cplusplus.com/reference/cstdlib/malloc/, // malloc might not return nullptr if size is zero, but the returned // pointer shall not be dereferenced -- so we make it nullptr. if (size <= 0) return nullptr; + if (FLAGS_use_pinned_memory) { + void* p = malloc(size); + if (p != nullptr) { + mlock(p, size); + } + } + void* p = malloc(size); if (p != nullptr && FLAGS_use_pinned_memory) { mlock(p, size); @@ -46,7 +53,7 @@ void* CPUAllocator::Alloc(size_t size) { return p; } -void CPUAllocator::Free(void* p, size_t size) { +void CPUAllocator::Free(void* p, size_t size, size_t index) { if (p != nullptr && FLAGS_use_pinned_memory) { munlock(p, size); } @@ -55,29 +62,52 @@ void CPUAllocator::Free(void* p, size_t size) { #ifndef PADDLE_ONLY_CPU -void* GPUAllocator::Alloc(size_t size) { +void* GPUAllocator::Alloc(size_t& index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr // if size is 0. We just make sure it does. - if (size <= 0) { - return nullptr; - } + if (size <= 0) return nullptr; + size_t available = 0; + size_t capacity = 0; + paddle::platform::GpuMemoryUsage(available, capacity); + + // Reserve memory for page tables, etc. + size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); + size_t remaining = available > reserving ? available - reserving : 0; + + // If remaining size no less than expected size, using general + // cudaMalloc to allocate GPU memory. void* p = 0; - cudaError_t result = - FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); - if (result != cudaSuccess) { - cudaGetLastError(); // clear error if there is any. + if (size <= remaining) { + cudaError_t result = cudaMalloc(&p, size); + if (result == cudaSuccess) { + index = 0; + total_alloc_size_ += size; + return p; + } } - return result == cudaSuccess ? p : nullptr; + + // If remaining size less than expected size or cudaMalloc failed, + // cudaMallocHost will be considered as a fallback allocator. + cudaError_t result = cudaMallocHost(&p, size); + if (result == cudaSuccess) { + index = 1; + total_alloc_size_ += size; + return p; + } + + return nullptr; } -void GPUAllocator::Free(void* p, size_t size) { +void GPUAllocator::Free(void* p, size_t size, size_t index) { // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if // cudaFree succeeds. - cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p); + PADDLE_ASSERT(total_alloc_size_ >= size); + total_alloc_size_ -= size; + cudaError_t err = index == 1 ? cudaFreeHost(p) : cudaFree(p); if (err != cudaErrorCudartUnloading) { platform::throw_on_error(err, "cudaFree{Host} failed"); } diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 184b383f7f782..e15302ce4f0ae 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -30,21 +30,24 @@ namespace detail { class SystemAllocator { public: virtual ~SystemAllocator() {} - virtual void* Alloc(size_t size) = 0; - virtual void Free(void* p, size_t size) = 0; + virtual void* Alloc(size_t& index, size_t size) = 0; + virtual void Free(void* p, size_t size, size_t index) = 0; }; class CPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); }; #ifndef PADDLE_ONLY_CPU class GPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + + private: + size_t total_alloc_size_ = 0; }; #endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc index 9bd5706a4e4d1..ba44e06ddb68e 100644 --- a/paddle/memory/detail/system_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory); void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { bool freed = false; { - void* p = a.Alloc(size); + size_t index; + void* p = a.Alloc(index, size); if (size > 0) { EXPECT_NE(p, nullptr); } else { @@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { int* i = static_cast(p); std::shared_ptr ptr(i, [&](void* p) { freed = true; - a.Free(p, size); + a.Free(p, size, index); }); } EXPECT_TRUE(freed); @@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) { } #ifndef PADDLE_ONLY_CPU -TEST(GPUAllocator, NoStaging) { - FLAGS_use_pinned_memory = false; - paddle::memory::detail::GPUAllocator a; - TestAllocator(a, 2048); - TestAllocator(a, 0); -} -TEST(GPUAllocator, Staging) { - FLAGS_use_pinned_memory = true; +TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a; TestAllocator(a, 2048); TestAllocator(a, 0); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 7abe2ab89e079..17342356d6018 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,3 +1,7 @@ +cc_library(cpu_info SRCS cpu_info.cc) +cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags) + +nv_library(gpu_info SRCS gpu_info.cc) nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc new file mode 100644 index 0000000000000..5b7ce7c763e39 --- /dev/null +++ b/paddle/platform/cpu_info_test.cc @@ -0,0 +1,18 @@ +#include "paddle/platform/cpu_info.h" + +#include +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +DECLARE_double(fraction_of_cpu_memory_to_use); + +TEST(CpuMemoryUsage, Print) { + std::stringstream ss; + size_t mem_size = paddle::platform::CpuTotalMemory() / 1024 / 1024 / 1024; + ss << std::to_string( + static_cast(FLAGS_fraction_of_cpu_memory_to_use * 100)) + << "% of CPU Memory Usage: " << mem_size << " GB"; + std::cout << ss.str(); +} diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h deleted file mode 100644 index 8fe891f9ce6c3..0000000000000 --- a/paddle/platform/cuda.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifndef PADDLE_ONLY_CPU - -#include -#include - -namespace paddle { -namespace platform { - -inline void throw_on_error(cudaError_t e, const char* message) { - if (e) { - throw thrust::system_error(e, thrust::cuda_category(), message); - } -} - -int GetDeviceCount(void) { - int count; - throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); - return count; -} - -} // namespace platform -} // namespace paddle - -#endif // PADDLE_ONLY_CPU From e6c14f7e000d047cf3d3a1e18e2a13e3349b1ff9 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 16:30:03 +0800 Subject: [PATCH 05/38] ENH: Polish cpu info interface --- paddle/platform/CMakeLists.txt | 3 +- paddle/platform/cpu_info.cc | 14 +++++++- paddle/platform/cpu_info.h | 10 ++++-- paddle/platform/cpu_info_test.cc | 13 ++++--- paddle/platform/cuda_test.cu | 59 -------------------------------- 5 files changed, 30 insertions(+), 69 deletions(-) delete mode 100644 paddle/platform/cuda_test.cu diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index d0bedf6ba921a..969c91985d069 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,8 +1,7 @@ cc_library(cpu_info SRCS cpu_info.cc) -cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags) +cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags glog) nv_library(gpu_info SRCS gpu_info.cc) -nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc index deff76502e8cb..3da04420e57c3 100644 --- a/paddle/platform/cpu_info.cc +++ b/paddle/platform/cpu_info.cc @@ -47,9 +47,21 @@ inline size_t CpuTotalPhysicalMemory() { #endif } -size_t CpuTotalMemory() { +size_t CpuMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); } +size_t CpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t CpuMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. + return CpuMaxAllocSize() / 32; +} + } // namespace platform } // namespace paddle diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h index 3b768589e150f..8df7c7b4bca5b 100644 --- a/paddle/platform/cpu_info.h +++ b/paddle/platform/cpu_info.h @@ -19,8 +19,14 @@ limitations under the License. */ namespace paddle { namespace platform { -//! Get the total memory on host machine. -size_t CpuTotalMemory(); +//! Get the maximum allocation size for a machine. +size_t CpuMaxAllocSize(); + +//! Get the minimum chunk size for buddy allocator. +size_t CpuMinChunkSize(); + +//! Get the maximum chunk size for buddy allocator. +size_t CpuMaxChunkSize(); } // namespace platform } // namespace paddle diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc index 5b7ce7c763e39..8fb195aa7c0a4 100644 --- a/paddle/platform/cpu_info_test.cc +++ b/paddle/platform/cpu_info_test.cc @@ -1,18 +1,21 @@ #include "paddle/platform/cpu_info.h" +#include "paddle/string/printf.h" #include #include #include "gflags/gflags.h" +#include "glog/logging.h" #include "gtest/gtest.h" DECLARE_double(fraction_of_cpu_memory_to_use); TEST(CpuMemoryUsage, Print) { std::stringstream ss; - size_t mem_size = paddle::platform::CpuTotalMemory() / 1024 / 1024 / 1024; - ss << std::to_string( - static_cast(FLAGS_fraction_of_cpu_memory_to_use * 100)) - << "% of CPU Memory Usage: " << mem_size << " GB"; - std::cout << ss.str(); + size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024; + float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100; + + std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n", + use_percent, memory_size) + << std::endl; } diff --git a/paddle/platform/cuda_test.cu b/paddle/platform/cuda_test.cu deleted file mode 100644 index 4067dda2f19f7..0000000000000 --- a/paddle/platform/cuda_test.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include "gtest/gtest.h" - -#define CHECK_ERR(x) \ - if (x != cudaSuccess) { \ - fprintf(stderr, \ - "%s in %s at line %d\n", \ - cudaGetErrorString(err), \ - __FILE__, \ - __LINE__); \ - exit(-1); \ - } - -__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < n) { - d_C[i] = d_A[i] + d_B[i]; - } -} - -TEST(Cuda, Equality) { - int n = 10; - // Memory allocation for h_A, h_B and h_C (in the host) - float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0}; - float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0}; - float h_C[10]; - float *d_A, *d_B, *d_C; - cudaError_t err; - // Memory allocation for d_A, d_B and d_C (in the device) - err = cudaMalloc((void **)&d_A, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_B, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_C, sizeof(float) * n); - CHECK_ERR(err); - - // Copying memory to device - err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - // Calling the kernel - vecAdd<<>>(d_A, d_B, d_C, n); - - // Copying results back to host - err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost); - CHECK_ERR(err); - - EXPECT_EQ(h_C[0], 1.0); - for (int i = 1; i < n - 1; ++i) { - EXPECT_EQ(h_C[i], 11.0); - } - EXPECT_EQ(h_C[9], 1.0); -} From 6e7209f0584f73eb22313d98c676333379736d1e Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 16:30:32 +0800 Subject: [PATCH 06/38] ENH: Add gpu info interface --- paddle/platform/gpu_info.cc | 25 +++++++++++++++++++++++++ paddle/platform/gpu_info.h | 6 ++++++ 2 files changed, 31 insertions(+) diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 4208d83078c90..d6c6fe34ef41e 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -42,8 +42,33 @@ size_t GpuMaxAllocSize() { GpuMemoryUsage(available, total); + // Reserve the rest for page tables, etc. return total * FLAGS_fraction_of_gpu_memory_to_use; } +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t GpuMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + // Reserving the rest memory for page tables, etc. + size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total; + + // If available less than minimum chunk size, no usable memory exists. + available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(); + + // If available less than reserving, no usable memory exists. + size_t usable = std::max(available, reserving) - reserving; + + return usable; +} + } // namespace platform } // namespace paddle diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index 174f093b435c9..d7bf0f4093ef3 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -30,6 +30,12 @@ void GpuMemoryUsage(size_t& available, size_t& total); //! Get the maximum allocation size of current GPU device. size_t GpuMaxAllocSize(); +//! Get the minimum chunk size for GPU buddy allocator. +size_t GpuMinChunkSize(); + +//! Get the maximum chunk size for GPU buddy allocator. +size_t GpuMaxChunkSize(); + } // namespace platform } // namespace paddle From 464886bf56cd91ffcd6617390d62dbd13c90a093 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 16:31:05 +0800 Subject: [PATCH 07/38] FIX: fix typo in piece.h --- paddle/string/piece.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/string/piece.h b/paddle/string/piece.h index db7c3e69804a6..0272529d1c9b2 100644 --- a/paddle/string/piece.h +++ b/paddle/string/piece.h @@ -35,7 +35,7 @@ class Piece { // We provide non-explicit singleton constructors so users can // pass in a "const char*" or a "string" wherever a "Piece" - // is expected. These contructors ensure that if data_ is NULL, + // is expected. These constructors ensure that if data_ is NULL, // size_ is 0. Piece(); Piece(const char* d, size_t n); From 26cd0bb5a59d913f8c216ceee0c6abb46317e31e Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 19:13:24 +0800 Subject: [PATCH 08/38] ENH: count allocated fallback size for performance --- paddle/memory/detail/system_allocator.cc | 52 +++++++++++++------ paddle/memory/detail/system_allocator.h | 3 +- .../paddle/trainer_config_helpers/networks.py | 4 +- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 332ff062d4784..2b0fbfa87e22a 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -39,22 +39,22 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { // pointer shall not be dereferenced -- so we make it nullptr. if (size <= 0) return nullptr; - if (FLAGS_use_pinned_memory) { - void* p = malloc(size); - if (p != nullptr) { - mlock(p, size); - } - } + index = 0; // unlock memory void* p = malloc(size); - if (p != nullptr && FLAGS_use_pinned_memory) { - mlock(p, size); + + if (p != nullptr) { + if (FLAGS_use_pinned_memory) { + index = 1; + mlock(p, size); // lock memory + } } + return p; } void CPUAllocator::Free(void* p, size_t size, size_t index) { - if (p != nullptr && FLAGS_use_pinned_memory) { + if (p != nullptr && index == 1) { munlock(p, size); } free(p); @@ -73,26 +73,34 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { // Reserve memory for page tables, etc. size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); - size_t remaining = available > reserving ? available - reserving : 0; + size_t usable = available > reserving ? available - reserving : 0; // If remaining size no less than expected size, using general // cudaMalloc to allocate GPU memory. void* p = 0; - if (size <= remaining) { + if (size <= usable) { cudaError_t result = cudaMalloc(&p, size); if (result == cudaSuccess) { index = 0; - total_alloc_size_ += size; + gpu_alloc_size_ += size; return p; } } // If remaining size less than expected size or cudaMalloc failed, // cudaMallocHost will be considered as a fallback allocator. + // + // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size + // of host fallback allocation. Allocates too much would reduce + // the amount of memory available to the underlying system for paging. + usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; + + if (size > usable) return nullptr; + cudaError_t result = cudaMallocHost(&p, size); if (result == cudaSuccess) { index = 1; - total_alloc_size_ += size; + fallback_alloc_size_ += size; return p; } @@ -100,16 +108,26 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { } void GPUAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = cudaFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + } + // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if // cudaFree succeeds. - PADDLE_ASSERT(total_alloc_size_ >= size); - total_alloc_size_ -= size; - cudaError_t err = index == 1 ? cudaFreeHost(p) : cudaFree(p); if (err != cudaErrorCudartUnloading) { - platform::throw_on_error(err, "cudaFree{Host} failed"); + platform::throw_on_error(err, + "cudaFree{Host} failed in GPUAllocator::Free."); } } diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index e15302ce4f0ae..7093c42967162 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -47,7 +47,8 @@ class GPUAllocator : public SystemAllocator { virtual void Free(void* p, size_t size, size_t index); private: - size_t total_alloc_size_ = 0; + size_t gpu_alloc_size_ = 0; + size_t fallback_alloc_size_ = 0; }; #endif // PADDLE_ONLY_CPU diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 67154a8d7d366..1bf59ed4840ae 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1381,7 +1381,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(* [l.name for l in layers]) + Inputs(*[l.name for l in layers]) def outputs(layers, *args): @@ -1424,7 +1424,7 @@ def __dfs_travel__(layer, assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(* [l.name for l in layers]) + Outputs(*[l.name for l in layers]) return # just return outputs. if len(layers) != 1: From fb51c3dc895b78df966dd0d9713657289b1986b3 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 19:57:10 +0800 Subject: [PATCH 09/38] FIX: add compile dependency gflags --- paddle/platform/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 969c91985d069..5cbe491b2b57f 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,7 +1,7 @@ -cc_library(cpu_info SRCS cpu_info.cc) +cc_library(cpu_info SRCS cpu_info.cc DEPS gflags) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags glog) -nv_library(gpu_info SRCS gpu_info.cc) +nv_library(gpu_info SRCS gpu_info.cc DEPS gflags) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) From 275e5b7d42903ea3c9bf4e4fed3f9eab45c727bf Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 3 Jul 2017 11:12:18 +0800 Subject: [PATCH 10/38] FIX: yapf format version --- python/paddle/trainer_config_helpers/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index f0b6625dc3736..b77932ce5f094 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1395,7 +1395,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(*[l.name for l in layers]) + Inputs(* [l.name for l in layers]) def outputs(layers, *args): @@ -1438,7 +1438,7 @@ def __dfs_travel__(layer, assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(*[l.name for l in layers]) + Outputs(* [l.name for l in layers]) return # just return outputs. if len(layers) != 1: From 89110fd2660098bc949a1f13f7b53515e0c931a3 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 3 Jul 2017 19:51:32 +0800 Subject: [PATCH 11/38] ENH: Add useGpu in system allocator --- paddle/memory/detail/system_allocator.cc | 4 ++++ paddle/memory/detail/system_allocator.h | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 2b0fbfa87e22a..75a2c91ef938e 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -60,6 +60,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { free(p); } +bool CPUAllocator::UseGpu() { return false; } + #ifndef PADDLE_ONLY_CPU void* GPUAllocator::Alloc(size_t& index, size_t size) { @@ -131,6 +133,8 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { } } +bool GPUAllocator::UseGpu() { return true; } + #endif // PADDLE_ONLY_CPU } // namespace detail diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 7093c42967162..f3bbfef843542 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -32,12 +32,14 @@ class SystemAllocator { virtual ~SystemAllocator() {} virtual void* Alloc(size_t& index, size_t size) = 0; virtual void Free(void* p, size_t size, size_t index) = 0; + virtual bool UseGpu() = 0; }; class CPUAllocator : public SystemAllocator { public: virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu(); }; #ifndef PADDLE_ONLY_CPU @@ -45,7 +47,7 @@ class GPUAllocator : public SystemAllocator { public: virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); - + virtual bool UseGpu(); private: size_t gpu_alloc_size_ = 0; size_t fallback_alloc_size_ = 0; From 929f9cbdff08090a222495db7db601f164cebb8c Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 3 Jul 2017 19:52:04 +0800 Subject: [PATCH 12/38] ENH: Add Metadata for memory block --- paddle/memory/detail/metadata.cc | 62 ++++++++++++++++++++++++++++++++ paddle/memory/detail/metadata.h | 53 +++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 paddle/memory/detail/metadata.cc create mode 100644 paddle/memory/detail/metadata.h diff --git a/paddle/memory/detail/metadata.cc b/paddle/memory/detail/metadata.cc new file mode 100644 index 0000000000000..4607cd8512ef5 --- /dev/null +++ b/paddle/memory/detail/metadata.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/memory/detail/metadata.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, + MemoryBlock* l, MemoryBlock* r) + : type(t), + index(i), + size(s), + total_size(ts), + left_buddy(l), + right_buddy(r) {} + +template +inline void hash_combine(std::size_t& seed, const T& v) { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +inline size_t hash(const Metadata* metadata, size_t initial_seed) { + size_t seed = initial_seed; + + hash_combine(seed, (size_t)metadata->type); + hash_combine(seed, metadata->index); + hash_combine(seed, metadata->size); + hash_combine(seed, metadata->total_size); + hash_combine(seed, metadata->left_buddy); + hash_combine(seed, metadata->right_buddy); + + return seed; +} + +void Metadata::update_guards() { + guard_begin = hash(this, 1); + guard_end = hash(this, 2); +} + +bool Metadata::check_guards() const { + return guard_begin == hash(this, 1) && guard_end == hash(this, 2); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/metadata.h b/paddle/memory/detail/metadata.h new file mode 100644 index 0000000000000..ddb826571b6d6 --- /dev/null +++ b/paddle/memory/detail/metadata.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +class Metadata { + public: + Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, + MemoryBlock* r); + + public: + /*! \brief Update the guards when metadata is changed */ + void update_guards(); + + /*! \brief Check consistency to previous modification */ + bool check_guards() const; + + public: + // TODO(gangliao): compress this + // clang-format off + size_t guard_begin = 0; + MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; + size_t index = 0; + size_t size = 0; + size_t total_size = 0; + MemoryBlock* left_buddy = nullptr; + MemoryBlock* right_buddy = nullptr; + size_t guard_end = 0; + // clang-format on +}; + +} // namespace detail +} // namespace memory +} // namespace paddle From bbd3eab7ee88f02131edb41738a966aa0f1a0e88 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 3 Jul 2017 19:54:32 +0800 Subject: [PATCH 13/38] ENH: Add Alloc for buddy Allocator * Free will be added soon --- paddle/memory/detail/buddy_allocator.cc | 157 ++++++++++++++++++++++-- paddle/memory/detail/buddy_allocator.h | 88 +++++++++---- 2 files changed, 209 insertions(+), 36 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index ebe680f5eea49..2462ba084b996 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -12,22 +12,161 @@ See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "paddle/memory/detail/buddy_allocator.h" +#include "glog/logging.h" namespace paddle { namespace memory { namespace detail { -BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator) - : pool_size_(pool_size), - max_pools_(max_pools), - system_allocator_(system_allocator) { - PADDLE_ASSERT(pool_size > 0); - PADDLE_ASSERT(max_pools > 0); +BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, + size_t min_chunk_size, size_t max_chunk_size) { + PADDLE_ASSERT(min_chunk_size > 0); + PADDLE_ASSERT(max_chunk_size > 0); PADDLE_ASSERT(system_allocator != nullptr); + + system_allocator_ = std::move(system_allocator); + min_chunk_size_ = min_chunk_size; + max_chunk_size_ = max_chunk_size; +} + +inline size_t align(size_t size, size_t alignment) { + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +void* BuddyAllocator::Alloc(size_t unaligned_size) { + // adjust allocation alignment + size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); + + // acquire the allocator lock + std::lock_guard lock(mutex_); + + DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; + + // if the allocation is huge, send directly to the system allocator + if (size > max_chunk_size_) { + DLOG(INFO) << "Allocate from system allocator."; + + return SystemAlloc(size); + } + + // query and allocate from the existing chunk + auto it = FindExistChunk(size); + + // refill the pool if failure + if (it == pool_.end()) { + it = RefillPool(); + } else { + DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); + } + + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } + + total_used_ += size; + total_free_ -= size; + + // split the allocation and return data for use + return reinterpret_cast(SplitToAlloc(it, size))->data(); +} + +void* BuddyAllocator::SystemAlloc(size_t size) { + size_t index = 0; + void* p = system_allocator_->Alloc(index, size); + + DLOG(INFO) << "Allocated " << p << " from system allocator."; + + if (p == nullptr) return nullptr; + + static_cast(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, + size, nullptr, nullptr); + + return static_cast(p)->data(); +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +#ifndef PADDLE_ONLY_CPU + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the maximum allocation size for the first allocation. + max_chunk_size_ = platform::GpuMaxChunkSize(); + } + } +#endif // PADDLE_ONLY_CPU + + // Allocate a new maximum sized block + size_t index = 0; + void* p = system_allocator_->Alloc(index, max_chunk_size_); + + if (p == nullptr) return pool_.end(); + + DLOG(INFO) << " Creating and inserting new block " << p + << " from system allocator"; + + static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, + max_chunk_size_, nullptr, nullptr); + + total_free_ += max_chunk_size_; + + // dump the block into pool + return pool_.insert({index, max_chunk_size_, p}).first; +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { + size_t index = 0; + + while (1) { + auto it = pool_.lower_bound({index, size, nullptr}); + if (it == pool_.end()) return it; + + if (std::get<0>(*it) > index) { + if (std::get<1>(*it) >= size) { + return it; + } + + index = std::get<0>(*it); + continue; + } + return it; + } +} + +void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, + size_t size) { + auto block = static_cast(std::get<2>(*it)); + + pool_.erase(it); + + DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_) + << ") into"; + + block->split(cache_, size); + + DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_) + << ")"; + + block->set_type(cache_, MemoryBlock::ARENA_CHUNK); + + // the rest of memory if exist + if (block->has_right_buddy(cache_)) { + if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { + DLOG(INFO) << " Insert right block (" << block->right_buddy(cache_) + << ", " << block->right_buddy(cache_)->total_size(cache_) + << ")"; + + pool_.insert({block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_)}); + } + } + + return block; } } // namespace detail diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 82e6aaedc7199..38bedc9a18366 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -15,9 +15,15 @@ #pragma once #include "paddle/memory/detail/system_allocator.h" +#include "paddle/memory/detail/metadata.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" +#include #include #include +#include namespace paddle { namespace memory { @@ -25,55 +31,83 @@ namespace detail { class BuddyAllocator { public: - BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator); + BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, + size_t max_chunk_size); + ~BuddyAllocator(); - void* Alloc(size_t size); + public: + void* Alloc(size_t unaligned_size); void Free(void*); size_t Used(); + public: + // Disable copy and assignment. + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + private: - struct Block { - size_t size_; - Block* left_; // left buddy - Block* right_; // right buddy - }; + // Tuple type: allocator index, memory size, memory address + using IndexSizeAddress = std::tuple; + using PoolSet = std::set; - // Initially, there is only one pool. If a Alloc founds not enough - // memory from that pool, and there has not been max_num_pools_, - // create a new pool by calling system_allocator_.Alloc(pool_size_). - std::vector pools_; + /*! \brief Allocate fixed-size memory from system */ + void* SystemAlloc(size_t size); - size_t pool_size_; // the size of each pool; - size_t max_num_pools_; // the size of all pools; + /*! \brief If existing chunks are not suitable, refill pool */ + PoolSet::iterator RefillPool(); - SystemAllocator* system_allocator_; + /** + * \brief Find the suitable chunk from existing pool + * + * \param it pool iterator which contains suitable block. + * \param size the size of allocation. + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); - std::mutex mutex_; + /*! \brief Find the existing chunk which used to allocation */ + PoolSet::iterator FindExistChunk(size_t size); - // Disable copy and assignment. - BuddyAllocator(const BuddyAllocator&) = delete; - BuddyAllocator& operator=(const BuddyAllocator&) = delete; + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + private: + PoolSet pool_; + + private: + // Unify the metadata format between GPU and CPU allocations + using MetadataCache = std::unordered_map; + MetadataCache cache_; + + private: + SystemAllocator* system_allocator_; + std::mutex mutex_; }; -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = nullptr; +BuddyAllocator* GetCPUBuddyAllocator() { + static BuddyAllocator* a = nullptr; if (a == nullptr) { - a = new BuddyAllocator(); + a = new BuddyAllocator(new CPUAllocator, platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); } return a; } #ifndef PADDLE_ONLY_CPU // The following code are for CUDA. -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; if (as == NULL) { - int gpu_num = platform::GetDeviceCount(); - as = new BuddyAllocator*[gpu_num]; + int gpu_num = platform::GpuDeviceCount(); + as = new BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = new BuddyAllocator(); + as[gpu] = + new BuddyAllocator(new GPUAllocator, platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); } } return as[gpu_id]; From 4e1617d05994bda1a9eb0e0b5b563249cc12f271 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 13:15:00 +0800 Subject: [PATCH 14/38] ENH: add buddy alloctor Free --- paddle/memory/detail/CMakeLists.txt | 3 + paddle/memory/detail/buddy_allocator.cc | 19 ++-- paddle/memory/detail/buddy_allocator.h | 10 +- paddle/memory/detail/memory_block.cc | 145 ++++++++++++++++++++++++ paddle/memory/detail/memory_block.h | 97 ++++++++++++++++ paddle/memory/detail/system_allocator.h | 1 + 6 files changed, 262 insertions(+), 13 deletions(-) create mode 100644 paddle/memory/detail/memory_block.cc create mode 100644 paddle/memory/detail/memory_block.h diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 6caa97a76bbfd..dbc98a8a62fb3 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -7,3 +7,6 @@ else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) endif(${WITH_GPU}) + +cc_library(metadata SRCS metadata.cc) +cc_library(buddy_allocator SRCS buddy_allocator.cc) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 2462ba084b996..e8d694327d8ce 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -58,17 +58,16 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // refill the pool if failure if (it == pool_.end()) { it = RefillPool(); + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } } else { DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it) << " at address " << reinterpret_cast(std::get<2>(*it))->data(); } - // if still failure, fail fatally - if (it == pool_.end()) { - return nullptr; - } - total_used_ += size; total_free_ -= size; @@ -76,6 +75,13 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { return reinterpret_cast(SplitToAlloc(it, size))->data(); } +void BuddyAllocator::Free(void* p) { + auto block = static_cast(p)->metadata(); + + // acquire the allocator lock + std::lock_guard lock(mutex_); +} + void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; void* p = system_allocator_->Alloc(index, size); @@ -140,17 +146,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, size_t size) { auto block = static_cast(std::get<2>(*it)); - pool_.erase(it); DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_) << ") into"; - block->split(cache_, size); DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_) << ")"; - block->set_type(cache_, MemoryBlock::ARENA_CHUNK); // the rest of memory if exist diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 38bedc9a18366..4006bdcce8d1c 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,16 +14,16 @@ #pragma once -#include "paddle/memory/detail/system_allocator.h" #include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" #include "paddle/platform/cpu_info.h" #include "paddle/platform/gpu_info.h" -#include #include -#include +#include #include +#include namespace paddle { namespace memory { @@ -57,9 +57,9 @@ class BuddyAllocator { /*! \brief If existing chunks are not suitable, refill pool */ PoolSet::iterator RefillPool(); - /** + /** * \brief Find the suitable chunk from existing pool - * + * * \param it pool iterator which contains suitable block. * \param size the size of allocation. */ diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc new file mode 100644 index 0000000000000..1c9e87df49703 --- /dev/null +++ b/paddle/memory/detail/memory_block.cc @@ -0,0 +1,145 @@ +#include "paddle/memory/detail/memory_block.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy) { + cache.store(this, + MemoryBlockMetadata(t, index, size - overhead(), size, + static_cast(left_buddy), + static_cast(right_buddy))); +} + +MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { + return cache.load(this).type; +} + +size_t MemoryBlock::size(MetadataCache& cache) const { + return cache.load(this).size; +} + +size_t MemoryBlock::total_size(MetadataCache& cache) const { + return cache.load(this).total_size; +} + +MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { + return cache.load(this).left_buddy; +} + +MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { + return cache.load(this).right_buddy; +} + +void MemoryBlock::split(MetadataCache& cache, size_t size) { + // make sure the split fits + assert(total_size(cache) >= size); + + // bail out if there is no room for another partition + if (total_size(cache) - size <= overhead()) { + return; + } + + // find the position of the split + void* right_partition = reinterpret_cast(this) + size; + + size_t remaining_size = total_size(cache) - size; + + // Add the new block as a buddy + auto metadata = cache.load(this); + + // Write the metadata for the new block + auto new_block_right_buddy = metadata.right_buddy; + + cache.store(static_cast(right_partition), + MemoryBlockMetadata(FREE_MEMORY, index(cache), + remaining_size - overhead(), remaining_size, + this, new_block_right_buddy)); + + metadata.right_buddy = static_cast(right_partition); + metadata.size = size - overhead(); + metadata.total_size = size; + + cache.store(this, metadata); + + // Write metadata for the new block's right buddy + if (new_block_right_buddy != nullptr) { + auto buddy_metadata = cache.load(new_block_right_buddy); + + buddy_metadata.left_buddy = static_cast(right_partition); + + cache.store(new_block_right_buddy, buddy_metadata); + } +} + +void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { + // only free blocks can be merged + assert(type(cache) == FREE_MEMORY); + assert(right_buddy->type(cache) == FREE_MEMORY); + + auto metadata = cache.load(this); + + // link this->buddy's buddy + metadata.right_buddy = right_buddy->right_buddy(cache); + + // link buddy's buddy -> this + if (metadata.right_buddy != nullptr) { + auto buddy_metadata = cache.load(metadata.right_buddy); + + buddy_metadata.left_buddy = this; + + cache.store(metadata.right_buddy, buddy_metadata); + } + + metadata.size += right_buddy->total_size(cache); + metadata.total_size += right_buddy->total_size(cache); + + cache.store(this, metadata); + cache.store(right_buddy, + MemoryBlockMetadata(INVALID_MEMORY, 0, 0, 0, nullptr, nullptr)); +} + +void MemoryBlock::mark_as_free(MetadataCache& cache) { + // check for double free or corruption + assert(type(cache) != FREE_MEMORY); + assert(type(cache) != INVALID_MEMORY); + + set_type(cache, FREE_MEMORY); +} + +void MemoryBlock::set_type(MetadataCache& cache, Type t) { + auto metadata = cache.load(this); + + metadata.type = t; + + cache.store(this, metadata); +} + +bool MemoryBlock::has_left_buddy(MetadataCache& cache) const { + return left_buddy(cache) != nullptr; +} + +bool MemoryBlock::has_right_buddy(MetadataCache& cache) const { + return right_buddy(cache) != nullptr; +} + +size_t MemoryBlock::index(MetadataCache& cache) const { + return cache.load(this).index; +} + +void* MemoryBlock::data() const { + return const_cast( + reinterpret_cast(this)) + + 1; +} + +MemoryBlock* MemoryBlock::metadata() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) - 1)); +} + +} // detail +} // memory +} // paddle diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h new file mode 100644 index 0000000000000..e2d39c31cfbca --- /dev/null +++ b/paddle/memory/detail/memory_block.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/metadata.h" + +#include +#include + +namespace paddle { +namespace memory { +namespace detail { + +// Forward Declaration +class Metadata; + +/*! \brief A class used to interpret the contents of a memory block */ +class MemoryBlock { + public: + // Unify the metadata format between GPU and CPU allocations + using MetadataCache = std::unordered_map; + + enum Type { + FREE_CHUNK, // memory is free and idle + ARENA_CHUNK, // memory is being occupied + HUGE_CHUNK, // memory is out of management + INVALID_CHUNK // memory is invalid + }; + + public: + void init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy); + + public: + /*! \brief The type of the allocation */ + Type type(MetadataCache& cache) const; + + /*! \brief The size of the data region */ + size_t size(MetadataCache& cache) const; + + /*! \brief An index to track the allocator */ + size_t index(MetadataCache& cache) const; + + /*! \brief The total size of the block */ + size_t total_size(MetadataCache& cache) const; + + /*! \brief Check the left buddy of the block */ + bool has_left_buddy(MetadataCache& cache) const; + + /*! \brief Check the right buddy of the block */ + bool has_right_buddy(MetadataCache& cache) const; + + /*! \brief Get the left buddy */ + MemoryBlock* left_buddy(MetadataCache& cache) const; + + /*! \brief Get the right buddy */ + MemoryBlock* right_buddy(MetadataCache& cache) const; + + public: + /*! \brief Split the allocation into left/right blocks */ + void split(MetadataCache& cache, size_t size); + + /*! \brief Merge left and right blocks together */ + void merge(MetadataCache& cache, MemoryBlock* right_buddy); + + /*! \brief Mark the allocation as free */ + void mark_as_free(MetadataCache& cache); + + /*! \brief Change the type of the allocation */ + void set_type(MetadataCache& cache, Type t); + + public: + /*! \brief Get a pointer to the memory block's data */ + void* data() const; + + /*! \brief Get a pointer to the memory block's metadata */ + MemoryBlock* metadata() const; + + public: + static size_t overhead(); +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index f3bbfef843542..555061a533ffe 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -48,6 +48,7 @@ class GPUAllocator : public SystemAllocator { virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); virtual bool UseGpu(); + private: size_t gpu_alloc_size_ = 0; size_t fallback_alloc_size_ = 0; From ff36389452c1af6cc6a5f03b5ca52404ab20f108 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 15:21:24 +0800 Subject: [PATCH 15/38] ENH: code style --- paddle/memory/detail/buddy_allocator.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index e8d694327d8ce..eddfd9d13c71b 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -48,7 +48,6 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // if the allocation is huge, send directly to the system allocator if (size > max_chunk_size_) { DLOG(INFO) << "Allocate from system allocator."; - return SystemAlloc(size); } From 379434b243faeaf9fd4d38cf9f95dfe45cc563d5 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 17:21:09 +0800 Subject: [PATCH 16/38] Delete cmake in dynload --- paddle/platform/dynload/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 paddle/platform/dynload/CMakeLists.txt diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt deleted file mode 100644 index 9f829b7012865..0000000000000 --- a/paddle/platform/dynload/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) From 0ba63475659822bd146f1f1dcfc7eabca8b7047d Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 21:23:23 +0800 Subject: [PATCH 17/38] ENH: Add buddy allocator Free --- paddle/memory/detail/CMakeLists.txt | 9 ++-- paddle/memory/detail/buddy_allocator.cc | 41 ++++++++++++++++++- paddle/memory/detail/buddy_allocator.h | 2 +- paddle/memory/detail/memory_block.h | 2 +- .../detail/{metadata.cc => meta_data.cc} | 2 +- .../memory/detail/{metadata.h => meta_data.h} | 0 paddle/platform/cpu_info.h | 10 ----- paddle/platform/gpu_info.cc | 13 ++++++ paddle/platform/gpu_info.h | 6 +++ 9 files changed, 65 insertions(+), 20 deletions(-) rename paddle/memory/detail/{metadata.cc => meta_data.cc} (97%) rename paddle/memory/detail/{metadata.h => meta_data.h} (100%) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index dbc98a8a62fb3..c3167cd30aa31 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,12 +1,9 @@ if(${WITH_GPU}) - nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) - nv_test(system_allocator_test - SRCS system_allocator_test.cc - DEPS system_allocator gpu_info gflags) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags gpu_info) else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) - cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) endif(${WITH_GPU}) +cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) -cc_library(metadata SRCS metadata.cc) +cc_library(meta_data SRCS meta_data.cc) cc_library(buddy_allocator SRCS buddy_allocator.cc) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index eddfd9d13c71b..f677feda0d559 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -75,10 +75,49 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { } void BuddyAllocator::Free(void* p) { + // Point back to metadata auto block = static_cast(p)->metadata(); - // acquire the allocator lock + // Acquire the allocator lock std::lock_guard lock(mutex_); + + DLOG(INFO) << "Free from address " << block; + + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + DLOG(INFO) << "Free directly from system allocator"; + system_allocator_->Free(block, block->total_size(cache_), + block->index(cache_)); + + // Invalidate GPU allocation from cache + if (system_allocator_->UseGpu()) { + cache_.erase(block); + } + return; + } + + block->mark_as_free(cache_); + + total_used_ -= block->total_size(cache_); + total_free_ += block->total_size(cache_); + + // Trying to merge the right buddy + if (block->has_right_buddy(cache_)) { + DLOG(INFO) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); + } + + // Trying to merge the left buddy + if (block->has_left_buddy(cache_)) { + DLOG(INFO) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); + } + + // Dumping this block into pool + DLOG(INFO) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; + pool_.insert({block->index(cache_), block->total_size(cache_), block}); + + // TODO(gangliao): Clean up if existing too much free memory } void* BuddyAllocator::SystemAlloc(size_t size) { diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 4006bdcce8d1c..49bd6cf9019e7 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/meta_data.h" #include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" #include "paddle/platform/cpu_info.h" diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h index e2d39c31cfbca..2945520113ae4 100644 --- a/paddle/memory/detail/memory_block.h +++ b/paddle/memory/detail/memory_block.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/meta_data.h" #include #include diff --git a/paddle/memory/detail/metadata.cc b/paddle/memory/detail/meta_data.cc similarity index 97% rename from paddle/memory/detail/metadata.cc rename to paddle/memory/detail/meta_data.cc index 4607cd8512ef5..a3b7a9b4fed94 100644 --- a/paddle/memory/detail/metadata.cc +++ b/paddle/memory/detail/meta_data.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/memory/detail/metadata.h" +#include "paddle/memory/detail/meta_data.h" #include diff --git a/paddle/memory/detail/metadata.h b/paddle/memory/detail/meta_data.h similarity index 100% rename from paddle/memory/detail/metadata.h rename to paddle/memory/detail/meta_data.h diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h index edd76517a6481..8df7c7b4bca5b 100644 --- a/paddle/platform/cpu_info.h +++ b/paddle/platform/cpu_info.h @@ -28,15 +28,5 @@ size_t CpuMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CpuMaxChunkSize(); -int GetCurrentDeviceId(void) { - int device_id; - throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); - return device_id; -} - -void SetDeviceId(int device_id) { - throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed"); -} - } // namespace platform } // namespace paddle diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index d6c6fe34ef41e..05a243c50696a 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -31,6 +31,19 @@ int GpuDeviceCount() { return count; } +int GetCurrentDeviceId() { + int device_id; + throw_on_error( + cudaGetDevice(&device_id), + "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId"); + return device_id; +} + +void SetDeviceId(int id) { + throw_on_error(cudaSetDevice(id), + "cudaSetDevice failed in paddle::platform::SetDeviceId"); +} + void GpuMemoryUsage(size_t& available, size_t& total) { throw_on_error(cudaMemGetInfo(&available, &total), "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index d7bf0f4093ef3..81ee5f6e0a95e 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -24,6 +24,12 @@ namespace platform { //! Get the total number of GPU devices in system. int GpuDeviceCount(); +//! Get the current GPU device id in system. +int GetCurrentDeviceId(); + +//! Set the GPU device id for next execution. +void SetDeviceId(int device_id); + //!Get the memory usage of current GPU device. void GpuMemoryUsage(size_t& available, size_t& total); From 4dc3c9e0cc1b6ec5dbc324f4804974247ca6506f Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 4 Jul 2017 23:28:15 +0800 Subject: [PATCH 18/38] ENH: Add paddle_memory for external usage --- paddle/memory/CMakeLists.txt | 10 ++++ paddle/memory/detail/CMakeLists.txt | 6 +++ paddle/memory/detail/buddy_allocator.cc | 12 ++--- paddle/memory/detail/buddy_allocator.h | 2 +- paddle/memory/detail/memory_block.cc | 56 +++++++++++-------- paddle/memory/detail/memory_block.h | 10 +--- paddle/memory/detail/meta_cache.cc | 57 ++++++++++++++++++++ paddle/memory/detail/meta_cache.h | 71 +++++++++++++++++++++++++ paddle/memory/detail/meta_data.cc | 8 +++ paddle/memory/detail/meta_data.h | 1 + 10 files changed, 196 insertions(+), 37 deletions(-) create mode 100644 paddle/memory/detail/meta_cache.cc create mode 100644 paddle/memory/detail/meta_cache.h diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 3943c3cfad31d..8c290712fc9fb 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1 +1,11 @@ add_subdirectory(detail) + +cc_library(memory + SRCS + memory.cc) + +cc_library(paddle_memory + DEPS + memory meta_data + meta_cache memory_block + buddy_allocator system_allocator) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index c3167cd30aa31..4fdabc8eebd4f 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -3,7 +3,13 @@ if(${WITH_GPU}) else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) endif(${WITH_GPU}) + cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) cc_library(meta_data SRCS meta_data.cc) + +cc_library(meta_cache SRCS meta_cache.cc) + +cc_library(memory_block SRCS memory_block.cc) + cc_library(buddy_allocator SRCS buddy_allocator.cc) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index f677feda0d559..aa5b6b557c5dd 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -20,14 +20,14 @@ namespace memory { namespace detail { BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, - size_t min_chunk_size, size_t max_chunk_size) { + size_t min_chunk_size, size_t max_chunk_size) + : min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + cache_(system_allocator->UseGpu()), + system_allocator_(std::move(system_allocator)) { PADDLE_ASSERT(min_chunk_size > 0); PADDLE_ASSERT(max_chunk_size > 0); PADDLE_ASSERT(system_allocator != nullptr); - - system_allocator_ = std::move(system_allocator); - min_chunk_size_ = min_chunk_size; - max_chunk_size_ = max_chunk_size; } inline size_t align(size_t size, size_t alignment) { @@ -90,7 +90,7 @@ void BuddyAllocator::Free(void* p) { // Invalidate GPU allocation from cache if (system_allocator_->UseGpu()) { - cache_.erase(block); + cache_.invalidate(block); } return; } diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 49bd6cf9019e7..ecf23b77ae8ff 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/memory/detail/meta_cache.h" #include "paddle/memory/detail/meta_data.h" #include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" @@ -80,7 +81,6 @@ class BuddyAllocator { private: // Unify the metadata format between GPU and CPU allocations - using MetadataCache = std::unordered_map; MetadataCache cache_; private: diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc index 1c9e87df49703..eaa97e7b4ad3d 100644 --- a/paddle/memory/detail/memory_block.cc +++ b/paddle/memory/detail/memory_block.cc @@ -1,4 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/meta_data.h" #include "paddle/platform/assert.h" namespace paddle { @@ -7,10 +23,9 @@ namespace detail { void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, void* left_buddy, void* right_buddy) { - cache.store(this, - MemoryBlockMetadata(t, index, size - overhead(), size, - static_cast(left_buddy), - static_cast(right_buddy))); + cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, + static_cast(left_buddy), + static_cast(right_buddy))); } MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { @@ -35,10 +50,10 @@ MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { void MemoryBlock::split(MetadataCache& cache, size_t size) { // make sure the split fits - assert(total_size(cache) >= size); + PADDLE_ASSERT(total_size(cache) >= size); // bail out if there is no room for another partition - if (total_size(cache) - size <= overhead()) { + if (total_size(cache) - size <= sizeof(Metadata)) { return; } @@ -53,13 +68,13 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) { // Write the metadata for the new block auto new_block_right_buddy = metadata.right_buddy; - cache.store(static_cast(right_partition), - MemoryBlockMetadata(FREE_MEMORY, index(cache), - remaining_size - overhead(), remaining_size, - this, new_block_right_buddy)); + cache.store( + static_cast(right_partition), + Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), + remaining_size, this, new_block_right_buddy)); metadata.right_buddy = static_cast(right_partition); - metadata.size = size - overhead(); + metadata.size = size - sizeof(Metadata); metadata.total_size = size; cache.store(this, metadata); @@ -76,8 +91,8 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) { void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { // only free blocks can be merged - assert(type(cache) == FREE_MEMORY); - assert(right_buddy->type(cache) == FREE_MEMORY); + PADDLE_ASSERT(type(cache) == FREE_MEMORY); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_MEMORY); auto metadata = cache.load(this); @@ -97,16 +112,15 @@ void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { metadata.total_size += right_buddy->total_size(cache); cache.store(this, metadata); - cache.store(right_buddy, - MemoryBlockMetadata(INVALID_MEMORY, 0, 0, 0, nullptr, nullptr)); + cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); } void MemoryBlock::mark_as_free(MetadataCache& cache) { // check for double free or corruption - assert(type(cache) != FREE_MEMORY); - assert(type(cache) != INVALID_MEMORY); + PADDLE_ASSERT(type(cache) != FREE_CHUNK); + PADDLE_ASSERT(type(cache) != INVALID_CHUNK); - set_type(cache, FREE_MEMORY); + set_type(cache, FREE_CHUNK); } void MemoryBlock::set_type(MetadataCache& cache, Type t) { @@ -130,14 +144,12 @@ size_t MemoryBlock::index(MetadataCache& cache) const { } void* MemoryBlock::data() const { - return const_cast( - reinterpret_cast(this)) + - 1; + return const_cast(reinterpret_cast(this)) + 1; } MemoryBlock* MemoryBlock::metadata() const { return const_cast(reinterpret_cast( - reinterpret_cast(this) - 1)); + reinterpret_cast(this) - 1)); } } // detail diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h index 2945520113ae4..a5168b519f3a3 100644 --- a/paddle/memory/detail/memory_block.h +++ b/paddle/memory/detail/memory_block.h @@ -14,24 +14,18 @@ #pragma once -#include "paddle/memory/detail/meta_data.h" - #include -#include namespace paddle { namespace memory { namespace detail { -// Forward Declaration -class Metadata; +// Forward Declarations +class MetadataCache; /*! \brief A class used to interpret the contents of a memory block */ class MemoryBlock { public: - // Unify the metadata format between GPU and CPU allocations - using MetadataCache = std::unordered_map; - enum Type { FREE_CHUNK, // memory is free and idle ARENA_CHUNK, // memory is being occupied diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc new file mode 100644 index 0000000000000..189ab4fc7bb74 --- /dev/null +++ b/paddle/memory/detail/meta_cache.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} + +Metadata MetadataCache::load(const MemoryBlock* block) { + if (uses_gpu_) { + auto existing_metadata = cache_.find(block); + assert(existing_metadata->second.check_guards()); + return existing_metadata->second; + } else { + PADDLE_ASSERT(reinterpret_cast(block)->check_guards()); + return *reinterpret_cast(block); + } +} + +void MetadataCache::store(MemoryBlock* block, + const Metadata& original_metadata) { + auto metadata = original_metadata; + + metadata.update_guards(); + + if (uses_gpu_) { + cache_[block] = metadata; + } else { + *reinterpret_cast(block) = metadata; + } +} + +void MetadataCache::invalidate(MemoryBlock* block) { + if (uses_gpu_) { + cache_.erase(block); + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h new file mode 100644 index 0000000000000..3ca1020d22ead --- /dev/null +++ b/paddle/memory/detail/meta_cache.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +/*! A cache for accessing memory block meta-data that may be expensive to access + directly. + + Note: this class exists to unify the metadata format between GPU and CPU + allocations. + It should be removed when the CPU can access all GPU allocations directly + via UVM. +*/ +class MetadataCache { + public: + MetadataCache(bool uses_gpu); + + public: + /*! \brief Load the associated metadata for the specified memory block. */ + Metadata load(const MemoryBlock*); + + /*! \brief Store the associated metadata for the specified memory block. */ + void store(MemoryBlock*, const Metadata&); + + public: + /*! \brief Acquire any external metadata updates. */ + void acquire(MemoryBlock*); + + /*! \brief Publish any local updates externally. */ + void release(MemoryBlock*); + + /*! \brief Indicate that the specified metadata will no longer be used */ + void invalidate(MemoryBlock*); + + public: + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + private: + bool uses_gpu_; + + private: + typedef std::unordered_map MetadataMap; + + private: + MetadataMap cache_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc index a3b7a9b4fed94..70c5c1f439e84 100644 --- a/paddle/memory/detail/meta_data.cc +++ b/paddle/memory/detail/meta_data.cc @@ -29,6 +29,14 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, left_buddy(l), right_buddy(r) {} +Metadata::Metadata() + : type(MemoryBlock::INVALID_CHUNK), + index(0), + size(0), + total_size(0), + left_buddy(nullptr), + right_buddy(nullptr) {} + template inline void hash_combine(std::size_t& seed, const T& v) { std::hash hasher; diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h index ddb826571b6d6..628cf1f2e347e 100644 --- a/paddle/memory/detail/meta_data.h +++ b/paddle/memory/detail/meta_data.h @@ -26,6 +26,7 @@ class Metadata { public: Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, MemoryBlock* r); + Metadata(); public: /*! \brief Update the guards when metadata is changed */ From d0ad0314bb868b9e0c1aa77f74ca0d2d3e8b8ef0 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 5 Jul 2017 16:33:18 +0800 Subject: [PATCH 19/38] FIX: glog dependency --- paddle/memory/detail/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 4fdabc8eebd4f..6cb6422e473da 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -12,4 +12,4 @@ cc_library(meta_cache SRCS meta_cache.cc) cc_library(memory_block SRCS memory_block.cc) -cc_library(buddy_allocator SRCS buddy_allocator.cc) +cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog) From ada1c20bbc2520d566b7d2bd2a56cf94cbcddd27 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 5 Jul 2017 19:16:02 +0800 Subject: [PATCH 20/38] FIX: Buddy Allocator Free with Merge feature --- paddle/memory/detail/buddy_allocator.cc | 33 ++++++++++++++++++++++--- paddle/memory/detail/buddy_allocator.h | 15 +++++++---- paddle/memory/detail/memory_block.cc | 4 +-- paddle/platform/CMakeLists.txt | 2 +- 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index aa5b6b557c5dd..9f334a7048fc6 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -89,9 +89,8 @@ void BuddyAllocator::Free(void* p) { block->index(cache_)); // Invalidate GPU allocation from cache - if (system_allocator_->UseGpu()) { - cache_.invalidate(block); - } + cache_.invalidate(block); + return; } @@ -104,12 +103,35 @@ void BuddyAllocator::Free(void* p) { if (block->has_right_buddy(cache_)) { DLOG(INFO) << "Merging this block " << block << " with its right buddy " << block->right_buddy(cache_); + + auto right_buddy = block->right_buddy(cache_); + + if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase({right_buddy->index(cache_), right_buddy->total_size(cache_), + right_buddy}); + + // merge its right buddy to the block + block->merge(cache_, right_buddy); + } } // Trying to merge the left buddy if (block->has_left_buddy(cache_)) { DLOG(INFO) << "Merging this block " << block << " with its left buddy " << block->left_buddy(cache_); + + auto left_buddy = block->left_buddy(cache_); + + if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase({left_buddy->index(cache_), left_buddy->total_size(cache_), + left_buddy}); + + // merge the block to its left buddy + left_buddy->merge(cache_, block); + block = left_buddy; + } } // Dumping this block into pool @@ -167,13 +189,16 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { while (1) { auto it = pool_.lower_bound({index, size, nullptr}); + + // no match chunk memory if (it == pool_.end()) return it; if (std::get<0>(*it) > index) { + // find suitable one if (std::get<1>(*it) >= size) { return it; } - + // update and continue index = std::get<0>(*it); continue; } diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index ecf23b77ae8ff..2fd9c8162a98a 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -42,14 +42,14 @@ class BuddyAllocator { void Free(void*); size_t Used(); - public: + private: // Disable copy and assignment. BuddyAllocator(const BuddyAllocator&) = delete; BuddyAllocator& operator=(const BuddyAllocator&) = delete; - private: - // Tuple type: allocator index, memory size, memory address + // Tuple (allocator index, memory size, memory address) using IndexSizeAddress = std::tuple; + // Each element in PoolSet is a free allocation using PoolSet = std::set; /*! \brief Allocate fixed-size memory from system */ @@ -57,7 +57,6 @@ class BuddyAllocator { /*! \brief If existing chunks are not suitable, refill pool */ PoolSet::iterator RefillPool(); - /** * \brief Find the suitable chunk from existing pool * @@ -77,13 +76,19 @@ class BuddyAllocator { size_t max_chunk_size_; // the maximum size of each chunk private: + /** + * \brief A list of free allocation + * + * \note Only store free chunk memory in pool + */ PoolSet pool_; private: - // Unify the metadata format between GPU and CPU allocations + /*! Unify the metadata format between GPU and CPU allocations */ MetadataCache cache_; private: + /*! Allocate CPU/GPU memory from system */ SystemAllocator* system_allocator_; std::mutex mutex_; }; diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc index eaa97e7b4ad3d..bc67bcef0fdf9 100644 --- a/paddle/memory/detail/memory_block.cc +++ b/paddle/memory/detail/memory_block.cc @@ -91,8 +91,8 @@ void MemoryBlock::split(MetadataCache& cache, size_t size) { void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { // only free blocks can be merged - PADDLE_ASSERT(type(cache) == FREE_MEMORY); - PADDLE_ASSERT(right_buddy->type(cache) == FREE_MEMORY); + PADDLE_ASSERT(type(cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); auto metadata = cache.load(this); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 2f3d1c061e37c..0ad11f1b106be 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -6,4 +6,4 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc) +cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags) From 74691789e9e5ee782adb003642f66699603b20e2 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 01:16:03 +0800 Subject: [PATCH 21/38] ENH: add memory unit test --- paddle/memory/CMakeLists.txt | 6 ++-- paddle/memory/detail/CMakeLists.txt | 4 +-- paddle/memory/detail/buddy_allocator.cc | 33 +++++++++++------ paddle/memory/detail/buddy_allocator.h | 27 -------------- paddle/memory/memory.cc | 42 ++++++++++++++++++---- paddle/memory/memory_test.cc | 48 +++++++++++++++++++++++++ paddle/platform/gpu_info.cc | 2 +- 7 files changed, 112 insertions(+), 50 deletions(-) create mode 100644 paddle/memory/memory_test.cc diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 8c290712fc9fb..fac442cca56b8 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,11 +1,11 @@ add_subdirectory(detail) -cc_library(memory - SRCS - memory.cc) +cc_library(memory SRCS memory.cc) cc_library(paddle_memory DEPS memory meta_data meta_cache memory_block buddy_allocator system_allocator) + +cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 6cb6422e473da..b9c3fc31c1523 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,7 +1,7 @@ if(${WITH_GPU}) - nv_library(system_allocator SRCS system_allocator.cc DEPS gflags gpu_info) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) else(${WITH_GPU}) - cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info) endif(${WITH_GPU}) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 9f334a7048fc6..ed2eedf9af800 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -24,10 +24,20 @@ BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, : min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), cache_(system_allocator->UseGpu()), - system_allocator_(std::move(system_allocator)) { - PADDLE_ASSERT(min_chunk_size > 0); - PADDLE_ASSERT(max_chunk_size > 0); - PADDLE_ASSERT(system_allocator != nullptr); + system_allocator_(std::move(system_allocator)) {} + +BuddyAllocator::~BuddyAllocator() { + DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; + while (!pool_.empty()) { + auto block = static_cast(std::get<2>(*pool_.begin())); + DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_ + << ")"; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + pool_.erase(pool_.begin()); + } } inline size_t align(size_t size, size_t alignment) { @@ -62,7 +72,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { return nullptr; } } else { - DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it) + DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it) << " at address " << reinterpret_cast(std::get<2>(*it))->data(); } @@ -142,6 +152,8 @@ void BuddyAllocator::Free(void* p) { // TODO(gangliao): Clean up if existing too much free memory } +size_t BuddyAllocator::Used() { return total_used_; } + void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; void* p = system_allocator_->Alloc(index, size); @@ -172,7 +184,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { if (p == nullptr) return pool_.end(); - DLOG(INFO) << " Creating and inserting new block " << p + DLOG(INFO) << "Creating and inserting new block " << p << " from system allocator"; static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, @@ -211,20 +223,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, auto block = static_cast(std::get<2>(*it)); pool_.erase(it); - DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_) + DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_) << ") into"; block->split(cache_, size); - DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_) + DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_) << ")"; block->set_type(cache_, MemoryBlock::ARENA_CHUNK); // the rest of memory if exist if (block->has_right_buddy(cache_)) { if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { - DLOG(INFO) << " Insert right block (" << block->right_buddy(cache_) - << ", " << block->right_buddy(cache_)->total_size(cache_) - << ")"; + DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; pool_.insert({block->right_buddy(cache_)->index(cache_), block->right_buddy(cache_)->total_size(cache_), diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 2fd9c8162a98a..eeb2dc88364bb 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -93,33 +93,6 @@ class BuddyAllocator { std::mutex mutex_; }; -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = nullptr; - if (a == nullptr) { - a = new BuddyAllocator(new CPUAllocator, platform::CpuMinChunkSize(), - platform::CpuMaxChunkSize()); - } - return a; -} - -#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. - -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; - if (as == NULL) { - int gpu_num = platform::GpuDeviceCount(); - as = new BuddyAllocator*[gpu_num]; - for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = - new BuddyAllocator(new GPUAllocator, platform::GpuMinChunkSize(), - platform::GpuMaxChunkSize()); - } - } - return as[gpu_id]; -} - -#endif // PADDLE_ONLY_CPU - } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 0d123d99e234a..dde6ff0ef3e54 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -22,37 +22,67 @@ limitations under the License. */ namespace paddle { namespace memory { +detail::BuddyAllocator* GetCPUBuddyAllocator() { + static detail::BuddyAllocator* a = nullptr; + if (a == nullptr) { + a = new detail::BuddyAllocator(new detail::CPUAllocator, + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); + } + return a; +} + +#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. + +detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static detail::BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GpuDeviceCount(); + as = new detail::BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + platform::SetDeviceId(gpu); + as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + } + } + return as[gpu_id]; +} + +#endif // PADDLE_ONLY_CPU + void* Alloc(platform::Place pl, size_t size) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { size_t gpu_id = boost::get(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size); + return GetGPUBuddyAllocator(gpu_id)->Alloc(size); } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Alloc(size); + return GetCPUBuddyAllocator()->Alloc(size); } void Free(paddle::platform::Place pl, void* p) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { size_t gpu_id = boost::get(pl).device; - detail::GetGPUBuddyAllocator(gpu_id)->Free(p); + GetGPUBuddyAllocator(gpu_id)->Free(p); + return; } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - detail::GetCPUBuddyAllocator()->Free(p); + GetCPUBuddyAllocator()->Free(p); } size_t Used(paddle::platform::Place pl) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { size_t gpu_id = boost::get(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Used(); + return GetGPUBuddyAllocator(gpu_id)->Used(); } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Used(); + return GetCPUBuddyAllocator()->Used(); } } // namespace memory diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc new file mode 100644 index 0000000000000..a98ccbb806830 --- /dev/null +++ b/paddle/memory/memory_test.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/memory.h" +#include "paddle/platform/place.h" + +#include "gtest/gtest.h" + +TEST(BuddyAllocator, CPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CPUPlace cpu; + p = paddle::memory::Alloc(cpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(cpu, p); +} + +#ifndef PADDLE_ONLY_CPU + +TEST(BuddyAllocator, GPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::GPUPlace gpu(0); + p = paddle::memory::Alloc(gpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(gpu, p); +} + +#endif // PADDLE_ONLY_CPU diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 05a243c50696a..fe475d23ce08b 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -56,7 +56,7 @@ size_t GpuMaxAllocSize() { GpuMemoryUsage(available, total); // Reserve the rest for page tables, etc. - return total * FLAGS_fraction_of_gpu_memory_to_use; + return static_cast(total * FLAGS_fraction_of_gpu_memory_to_use); } size_t GpuMinChunkSize() { From 936cd1e11885daf35ae34b9fbf11a798e9e51022 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 01:34:51 +0800 Subject: [PATCH 22/38] FIX: code format --- paddle/memory/memory.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index dde6ff0ef3e54..43f2084e8dfd0 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -29,7 +29,7 @@ detail::BuddyAllocator* GetCPUBuddyAllocator() { platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); } - return a; + return a; } #ifndef PADDLE_ONLY_CPU // The following code are for CUDA. From 5d2e8edb3ec86e13b7684cbe372650d21fe7954d Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 01:37:20 +0800 Subject: [PATCH 23/38] FIX: dynamic loader deps --- paddle/platform/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 0ad11f1b106be..bf3e8f31abe44 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -6,4 +6,4 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags) +cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags glog) From 3ad8e364715915fba5909c137834e34f38b6e9ac Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 11:24:01 +0800 Subject: [PATCH 24/38] FIX: merge static libs with propagation dependencies --- cmake/generic.cmake | 51 ++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index cae9524b2fe14..87d8caaec4059 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -99,15 +99,37 @@ function(merge_static_libs TARGET_NAME) set(libs ${ARGN}) list(REMOVE_DUPLICATES libs) - # First get the file names of the libraries to be merged + # Get all propagation dependencies from the merged libraries foreach(lib ${libs}) + list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) + endforeach() + + # To produce a library we need at least one source file. + # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). + add_custom_command(OUTPUT ${dummyfile} + COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile} + DEPENDS ${libs}) + + # Generate dummy staic lib + file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + add_library(${TARGET_NAME} STATIC ${dummyfile}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + foreach(lib ${libs}) + # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() + # Get the file name of the generated library + set(outlibfile "$") + if(APPLE) # Use OSX's libtool to merge archives - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) - file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") - add_library(${TARGET_NAME} STATIC ${dummyfile}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) @@ -117,7 +139,8 @@ function(merge_static_libs TARGET_NAME) set(objdir ${lib}.objdir) add_custom_command(OUTPUT ${objdir} - COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}) + COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir} + DEPENDS ${lib}) add_custom_command(OUTPUT ${objlistfile} COMMAND ${CMAKE_AR} -x "$" @@ -125,23 +148,9 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${lib} ${objdir} WORKING_DIRECTORY ${objdir}) - # Empty dummy source file that goes into merged library - set(mergebase ${lib}.mergebase.c) - add_custom_command(OUTPUT ${mergebase} - COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} - DEPENDS ${objlistfile}) - - list(APPEND mergebases "${mergebase}") - endforeach() - - # We need a target for the output merged library - add_library(${TARGET_NAME} STATIC ${mergebases}) - set(outlibfile "$") - - foreach(lib ${libs}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist" - WORKING_DIRECTORY ${lib}.objdir) + COMMAND ${CMAKE_AR} ru ${outlibfile} *.o + WORKING_DIRECTORY ${objdir}) endforeach() add_custom_command(TARGET ${TARGET_NAME} POST_BUILD From a669bf48d966a92206c57d72258bb625b5ff2fbc Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 13:38:11 +0800 Subject: [PATCH 25/38] FIX: explicit construct pool element --- paddle/memory/detail/buddy_allocator.cc | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index ed2eedf9af800..2cfacec46c108 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -118,8 +118,9 @@ void BuddyAllocator::Free(void* p) { if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { // Take away right buddy from pool - pool_.erase({right_buddy->index(cache_), right_buddy->total_size(cache_), - right_buddy}); + pool_.erase(IndexSizeAddress(right_buddy->index(cache_), + right_buddy->total_size(cache_), + right_buddy)); // merge its right buddy to the block block->merge(cache_, right_buddy); @@ -135,8 +136,8 @@ void BuddyAllocator::Free(void* p) { if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { // Take away right buddy from pool - pool_.erase({left_buddy->index(cache_), left_buddy->total_size(cache_), - left_buddy}); + pool_.erase(IndexSizeAddress(left_buddy->index(cache_), + left_buddy->total_size(cache_), left_buddy)); // merge the block to its left buddy left_buddy->merge(cache_, block); @@ -147,7 +148,8 @@ void BuddyAllocator::Free(void* p) { // Dumping this block into pool DLOG(INFO) << "Inserting free block (" << block << ", " << block->total_size(cache_) << ")"; - pool_.insert({block->index(cache_), block->total_size(cache_), block}); + pool_.insert( + IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); // TODO(gangliao): Clean up if existing too much free memory } @@ -193,14 +195,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { total_free_ += max_chunk_size_; // dump the block into pool - return pool_.insert({index, max_chunk_size_, p}).first; + return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; } BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { size_t index = 0; while (1) { - auto it = pool_.lower_bound({index, size, nullptr}); + auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr)); // no match chunk memory if (it == pool_.end()) return it; @@ -237,9 +239,10 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", " << block->right_buddy(cache_)->total_size(cache_) << ")"; - pool_.insert({block->right_buddy(cache_)->index(cache_), - block->right_buddy(cache_)->total_size(cache_), - block->right_buddy(cache_)}); + pool_.insert( + IndexSizeAddress(block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_))); } } From adf8c95b62fc5ef1f608bc06dce32bb4b396828c Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 15:40:22 +0800 Subject: [PATCH 26/38] FIX: propagation dependencies under linux --- cmake/generic.cmake | 68 ++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 87d8caaec4059..3900ea26048d3 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -104,36 +104,32 @@ function(merge_static_libs TARGET_NAME) list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) endforeach() - # To produce a library we need at least one source file. - # It is created by add_custom_command below and will helps - # also help to track dependencies. - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) - - # Make the generated dummy source file depended on all static input - # libs. If input lib changes,the source file is touched - # which causes the desired effect (relink). - add_custom_command(OUTPUT ${dummyfile} - COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile} - DEPENDS ${libs}) - - # Generate dummy staic lib - file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") - add_library(${TARGET_NAME} STATIC ${dummyfile}) - target_link_libraries(${TARGET_NAME} ${libs_deps}) + if(APPLE) # Use OSX's libtool to merge archives + # To produce a library we need at least one source file. + # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) - foreach(lib ${libs}) - # Get the file names of the libraries to be merged - set(libfiles ${libfiles} $) - endforeach() + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). + add_custom_command(OUTPUT ${dummyfile} + COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile} + DEPENDS ${libs}) - # Get the file name of the generated library - set(outlibfile "$") + # Generate dummy staic lib + file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + add_library(${TARGET_NAME} STATIC ${dummyfile}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) - if(APPLE) # Use OSX's libtool to merge archives + foreach(lib ${libs}) + # Get the file names of the libraries to be merged + set(libfiles ${libfiles} $) + endforeach() add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) - else() # general UNIX: use "ar" to extract objects and re-add to a common lib + else() # general UNIX: use "ar" to extract objects and re-add to a common lib foreach(lib ${libs}) set(objlistfile ${lib}.objlist) # list of objects in the input library set(objdir ${lib}.objdir) @@ -148,13 +144,27 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${lib} ${objdir} WORKING_DIRECTORY ${objdir}) - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_AR} ru ${outlibfile} *.o - WORKING_DIRECTORY ${objdir}) + # Empty dummy source file that goes into merged library + set(mergebase ${lib}.mergebase.c) + add_custom_command(OUTPUT ${mergebase} + COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} + DEPENDS ${objlistfile}) + + list(APPEND mergebases "${mergebase}") endforeach() - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_RANLIB} ${outlibfile}) + add_library(${TARGET_NAME} STATIC ${mergebases}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + # Get the file name of the generated library + set(outlibfile "$") + + foreach(lib ${libs}) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_AR} cr ${outlibfile} *.o + COMMAND ${CMAKE_RANLIB} ${outlibfile} + WORKING_DIRECTORY ${lib}.objdir) + endforeach() endif() endfunction(merge_static_libs) From ddfa6cf0d1fe91f8bf2e1d55841afee9e30d1859 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 6 Jul 2017 17:07:04 +0800 Subject: [PATCH 27/38] FIX: remove boost from memory folder --- paddle/memory/memory.cc | 56 +++++++++++++++++++---------------------- paddle/memory/memory.h | 11 +++++--- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 43f2084e8dfd0..def580f7a4b22 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -32,7 +32,22 @@ detail::BuddyAllocator* GetCPUBuddyAllocator() { return a; } -#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. +template <> +void* Alloc(platform::CPUPlace place, size_t size) { + return GetCPUBuddyAllocator()->Alloc(size); +} + +template <> +void Free(platform::CPUPlace place, void* p) { + GetCPUBuddyAllocator()->Free(p); +} + +template <> +size_t Used(platform::CPUPlace place) { + return GetCPUBuddyAllocator()->Used(); +} + +#ifndef PADDLE_ONLY_CPU detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { static detail::BuddyAllocator** as = NULL; @@ -49,41 +64,22 @@ detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { return as[gpu_id]; } -#endif // PADDLE_ONLY_CPU +template <> +void* Alloc(platform::GPUPlace place, size_t size) { + return GetGPUBuddyAllocator(place.device)->Alloc(size); +} -void* Alloc(platform::Place pl, size_t size) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - return GetGPUBuddyAllocator(gpu_id)->Alloc(size); - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return GetCPUBuddyAllocator()->Alloc(size); +template <> +void Free(platform::GPUPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); } -void Free(paddle::platform::Place pl, void* p) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - GetGPUBuddyAllocator(gpu_id)->Free(p); - return; - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - GetCPUBuddyAllocator()->Free(p); +template <> +size_t Used(platform::GPUPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); } -size_t Used(paddle::platform::Place pl) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - return GetGPUBuddyAllocator(gpu_id)->Used(); - } #endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return GetCPUBuddyAllocator()->Used(); -} } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index a33092bade65e..2d6f4fd2a08ee 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -19,9 +19,14 @@ limitations under the License. */ namespace paddle { namespace memory { -void* Alloc(paddle::platform::Place, size_t); -void Free(paddle::platform::Place, void*); -size_t Used(paddle::platform::Place); +template +void* Alloc(Place, size_t); + +template +void Free(Place, void*); + +template +size_t Used(Place); } // namespace memory } // namespace paddle From 199b5fcb45c69560de1b24b3147f5e7db309abe3 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 10 Jul 2017 11:22:17 +0800 Subject: [PATCH 28/38] ENH: refine code comments --- paddle/memory/detail/buddy_allocator.h | 3 ++- paddle/memory/detail/meta_cache.h | 25 +++++++++--------------- paddle/memory/detail/system_allocator.cc | 4 ++-- paddle/memory/detail/system_allocator.h | 6 +++--- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index eeb2dc88364bb..a89dd8eb7c19d 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -42,7 +42,7 @@ class BuddyAllocator { void Free(void*); size_t Used(); - private: + public: // Disable copy and assignment. BuddyAllocator(const BuddyAllocator&) = delete; BuddyAllocator& operator=(const BuddyAllocator&) = delete; @@ -57,6 +57,7 @@ class BuddyAllocator { /*! \brief If existing chunks are not suitable, refill pool */ PoolSet::iterator RefillPool(); + /** * \brief Find the suitable chunk from existing pool * diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h index 3ca1020d22ead..ca0789779e273 100644 --- a/paddle/memory/detail/meta_cache.h +++ b/paddle/memory/detail/meta_cache.h @@ -23,14 +23,14 @@ namespace paddle { namespace memory { namespace detail { -/*! A cache for accessing memory block meta-data that may be expensive to access - directly. - - Note: this class exists to unify the metadata format between GPU and CPU - allocations. - It should be removed when the CPU can access all GPU allocations directly - via UVM. -*/ +/** + * \brief A cache for accessing memory block meta-data that may be expensive + * to access directly. + * + * \note This class exists to unify the metadata format between GPU and CPU + * allocations. It should be removed when the CPU can access all GPU + * allocations directly via UVM. + */ class MetadataCache { public: MetadataCache(bool uses_gpu); @@ -42,14 +42,7 @@ class MetadataCache { /*! \brief Store the associated metadata for the specified memory block. */ void store(MemoryBlock*, const Metadata&); - public: - /*! \brief Acquire any external metadata updates. */ - void acquire(MemoryBlock*); - - /*! \brief Publish any local updates externally. */ - void release(MemoryBlock*); - - /*! \brief Indicate that the specified metadata will no longer be used */ + /*! \brief Indicate that the specified metadata will no longer be used. */ void invalidate(MemoryBlock*); public: diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 75a2c91ef938e..1579174b1a6ff 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -60,7 +60,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { free(p); } -bool CPUAllocator::UseGpu() { return false; } +bool CPUAllocator::UseGpu() const { return false; } #ifndef PADDLE_ONLY_CPU @@ -133,7 +133,7 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { } } -bool GPUAllocator::UseGpu() { return true; } +bool GPUAllocator::UseGpu() const { return true; } #endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 555061a533ffe..04efcd9709445 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -32,14 +32,14 @@ class SystemAllocator { virtual ~SystemAllocator() {} virtual void* Alloc(size_t& index, size_t size) = 0; virtual void Free(void* p, size_t size, size_t index) = 0; - virtual bool UseGpu() = 0; + virtual bool UseGpu() const = 0; }; class CPUAllocator : public SystemAllocator { public: virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); - virtual bool UseGpu(); + virtual bool UseGpu() const; }; #ifndef PADDLE_ONLY_CPU @@ -47,7 +47,7 @@ class GPUAllocator : public SystemAllocator { public: virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); - virtual bool UseGpu(); + virtual bool UseGpu() const; private: size_t gpu_alloc_size_ = 0; From d4017cadcd0fa07d8874e052ffa91700ebb32a05 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 11 Jul 2017 15:18:38 +0800 Subject: [PATCH 29/38] ENH: Add auto-free if allocate too much --- paddle/memory/detail/buddy_allocator.cc | 69 ++++++++++++++++++++++++- paddle/memory/detail/buddy_allocator.h | 3 ++ 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 3f630973e906c..27c1b4033b53b 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -152,7 +152,7 @@ void BuddyAllocator::Free(void* p) { IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); // Clean up if existing too much free memory - + // Prefer freeing fallback allocation first CleanIdleFallBackAlloc(); @@ -198,6 +198,12 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, max_chunk_size_, nullptr, nullptr); + // gpu fallback allocation + if (system_allocator_->UseGpu() && + static_cast(p)->index(cache_) == 1) { + fallback_alloc_count_++; + } + total_free_ += max_chunk_size_; // dump the block into pool @@ -256,9 +262,68 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, } void BuddyAllocator::CleanIdleFallBackAlloc() { - + // If fallback allocation does not exist, return directly + if (!fallback_alloc_count_) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + // If no GPU fallback allocator, return + if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { + return; + } + + DLOG(INFO) << "Return block " << block << " to fallback allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + fallback_alloc_count_--; + + // If no fall allocation exists, return directly + if (!fallback_alloc_count_) return; + } } +void BuddyAllocator::CleanIdleNormalAlloc() { + auto shall_free_alloc = [&]() -> bool { + // free all fallback allocations + if (fallback_alloc_count_ > 0) { + return true; + } + // keep 2x overhead if we haven't fallen back + if ((total_used_ + max_chunk_size_) * 2 < total_free_) { + return true; + } + return false; + }; + + if (!shall_free_alloc()) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + DLOG(INFO) << "Return block " << block << " to base allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + + if (!shall_free_alloc()) return; + } +} } // namespace detail } // namespace memory diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 14ee1fa07c031..4fa3fb0ee5f82 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -94,6 +94,9 @@ class BuddyAllocator { */ PoolSet pool_; + /*! Record fallback allocation count for auto-scaling */ + size_t fallback_alloc_count_ = 0; + private: /*! Unify the metadata format between GPU and CPU allocations */ MetadataCache cache_; From 6a3b8416df124153d4a1fd1f8f559107578ed58e Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 11 Jul 2017 15:20:43 +0800 Subject: [PATCH 30/38] FIX: clang-format --- paddle/memory/memory_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index fed7444798fe2..9fdcd03b1a664 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -33,7 +33,7 @@ TEST(BuddyAllocator, CPUAllocation) { TEST(BuddyAllocator, CPUMultAlloc) { paddle::platform::CPUPlace cpu; - std::vector ps; + std::vector ps; ps.reserve(8); for (auto size : {256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { From 383b96f32c60ec542819c62b4e09009cae9afc9d Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 11 Jul 2017 16:26:58 +0800 Subject: [PATCH 31/38] FIX: merge conflicts --- paddle/memory/detail/meta_cache.cc | 2 +- paddle/memory/memory.cc | 2 +- paddle/platform/CMakeLists.txt | 2 +- paddle/platform/device_context.h | 3 ++- paddle/platform/gpu_info.cc | 4 ++-- paddle/platform/gpu_info.h | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc index 189ab4fc7bb74..30ff80e7bac0b 100644 --- a/paddle/memory/detail/meta_cache.cc +++ b/paddle/memory/detail/meta_cache.cc @@ -25,7 +25,7 @@ MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} Metadata MetadataCache::load(const MemoryBlock* block) { if (uses_gpu_) { auto existing_metadata = cache_.find(block); - assert(existing_metadata->second.check_guards()); + PADDLE_ASSERT(existing_metadata->second.check_guards()); return existing_metadata->second; } else { PADDLE_ASSERT(reinterpret_cast(block)->check_guards()); diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index def580f7a4b22..430ce98bfc145 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -52,7 +52,7 @@ size_t Used(platform::CPUPlace place) { detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { static detail::BuddyAllocator** as = NULL; if (as == NULL) { - int gpu_num = platform::GpuDeviceCount(); + int gpu_num = platform::GetDeviceCount(); as = new detail::BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { platform::SetDeviceId(gpu); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 4b3f55b3c7724..d16c747aee2f9 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -8,4 +8,4 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags) cc_library(dynamic_loader SRCS dynload/dynamic_loader.cc DEPS gflags glog) -nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3) +nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 gpu_info) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 160eb4e12060b..02194581d1dcb 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -16,10 +16,11 @@ limitations under the License. */ #include "paddle/framework/enforce.h" #ifndef PADDLE_ONLY_CPU -#include "paddle/platform/cuda.h" #include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/curand.h" +#include "paddle/platform/error.h" +#include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif #include "paddle/platform/place.h" diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index fe475d23ce08b..9b917f9d35f7f 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -23,11 +23,11 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, namespace paddle { namespace platform { -int GpuDeviceCount() { +int GetDeviceCount() { int count; throw_on_error( cudaGetDeviceCount(&count), - "cudaGetDeviceCount failed in paddle::platform::GpuDeviceCount"); + "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount"); return count; } diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index 81ee5f6e0a95e..79e71956bd32e 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -22,7 +22,7 @@ namespace paddle { namespace platform { //! Get the total number of GPU devices in system. -int GpuDeviceCount(); +int GetDeviceCount(); //! Get the current GPU device id in system. int GetCurrentDeviceId(); From ff98e3c1ece983403ebdfa57f07d3bdf58f85647 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 13 Jul 2017 14:26:48 +0800 Subject: [PATCH 32/38] ENH: Remove comments --- paddle/memory/detail/system_allocator.h | 12 +++++------- paddle/platform/gpu_info.cc | 1 - 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 04efcd9709445..82ba322e05757 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -20,13 +20,11 @@ namespace paddle { namespace memory { namespace detail { -// SystemAllocator is the parent class of CPUAllocator and -// GPUAllocator. A BuddyAllocator object uses a SystemAllocator* -// pointing to the underlying system allocator. An alternative to -// this class hierarchy is to pass a system allocator class to -// BuddyAllocator as a template parameter. This approach makes -// BuddyAllocator a class template, and it's very complicated -// algorithm would make the buddy_allocator.h messy. +/** + * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator. + * A BuddyAllocator object uses a SystemAllocator* pointing to the + * underlying system allocator. + */ class SystemAllocator { public: virtual ~SystemAllocator() {} diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 9b917f9d35f7f..a1383d3524aed 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -65,7 +65,6 @@ size_t GpuMinChunkSize() { } size_t GpuMaxChunkSize() { - // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. size_t total = 0; size_t available = 0; From 00572aa451d44ccb32b1c59a59241d7000c68fda Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 13 Jul 2017 19:14:09 +0800 Subject: [PATCH 33/38] Add memory alignment test --- paddle/memory/memory_test.cc | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 9fdcd03b1a664..4c9b3311bb12c 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -13,9 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/memory.h" +#include "gtest/gtest.h" #include "paddle/platform/place.h" -#include "gtest/gtest.h" +template +inline bool is_aligned(T *p, size_t n = alignof(T)) { + return 0 == (reinterpret_cast(p) % n); +} TEST(BuddyAllocator, CPUAllocation) { void *p = nullptr; @@ -36,11 +40,13 @@ TEST(BuddyAllocator, CPUMultAlloc) { std::vector ps; ps.reserve(8); - for (auto size : {256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { ps.emplace_back(paddle::memory::Alloc(cpu, size)); } for (auto p : ps) { + EXPECT_EQ(is_aligned(p, 32), true); paddle::memory::Free(cpu, p); } } @@ -60,4 +66,21 @@ TEST(BuddyAllocator, GPUAllocation) { paddle::memory::Free(gpu, p); } +TEST(BuddyAllocator, GPUMultAlloc) { + paddle::platform::GPUPlace gpu; + + std::vector ps; + ps.reserve(8); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps.emplace_back(paddle::memory::Alloc(gpu, size)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p, 32), true); + paddle::memory::Free(gpu, p); + } +} + #endif // PADDLE_ONLY_CPU From ab5fe1e9071ef67850683442035f27c6c602e126 Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 11:52:03 +0800 Subject: [PATCH 34/38] ENH: memory test: check alignment and memory size --- paddle/memory/memory_test.cc | 80 ++++++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 13 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 4c9b3311bb12c..458c8b2e24fa2 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -13,14 +13,36 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/memory.h" -#include "gtest/gtest.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" #include "paddle/platform/place.h" -template -inline bool is_aligned(T *p, size_t n = alignof(T)) { +#include +#include + +inline bool is_aligned(void const *p, const size_t n) { return 0 == (reinterpret_cast(p) % n); } +size_t align(size_t size, paddle::platform::CPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::CpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +size_t align(size_t size, paddle::platform::GPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +void update_size(size_t &total_size, const size_t size) {} + TEST(BuddyAllocator, CPUAllocation) { void *p = nullptr; @@ -37,17 +59,33 @@ TEST(BuddyAllocator, CPUAllocation) { TEST(BuddyAllocator, CPUMultAlloc) { paddle::platform::CPUPlace cpu; - std::vector ps; - ps.reserve(8); + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(cpu); + EXPECT_EQ(total_size, 0UL); for (auto size : {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { - ps.emplace_back(paddle::memory::Alloc(cpu, size)); + ps[paddle::memory::Alloc(cpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(size, cpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); } for (auto p : ps) { - EXPECT_EQ(is_aligned(p, 32), true); - paddle::memory::Free(cpu, p); + EXPECT_EQ(is_aligned(p.first, 32), true); + paddle::memory::Free(cpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(p.second, cpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); } } @@ -69,17 +107,33 @@ TEST(BuddyAllocator, GPUAllocation) { TEST(BuddyAllocator, GPUMultAlloc) { paddle::platform::GPUPlace gpu; - std::vector ps; - ps.reserve(8); + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(gpu); + EXPECT_EQ(total_size, 0UL); for (auto size : {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { - ps.emplace_back(paddle::memory::Alloc(gpu, size)); + ps[paddle::memory::Alloc(gpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(size, gpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); } for (auto p : ps) { - EXPECT_EQ(is_aligned(p, 32), true); - paddle::memory::Free(gpu, p); + EXPECT_EQ(is_aligned(p.first, 32), true); + paddle::memory::Free(gpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(p.second, gpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); } } From 21b7915d9122d29bdb7506ab2e30049653ccf52a Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 12:03:09 +0800 Subject: [PATCH 35/38] Fix condition compile --- paddle/memory/memory_test.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 458c8b2e24fa2..e13cbabb268a8 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -34,13 +34,6 @@ size_t align(size_t size, paddle::platform::CPUPlace place) { return remaining == 0 ? size : size + (alignment - remaining); } -size_t align(size_t size, paddle::platform::GPUPlace place) { - size += sizeof(paddle::memory::detail::Metadata); - size_t alignment = paddle::platform::GpuMinChunkSize(); - size_t remaining = size % alignment; - return remaining == 0 ? size : size + (alignment - remaining); -} - void update_size(size_t &total_size, const size_t size) {} TEST(BuddyAllocator, CPUAllocation) { @@ -91,6 +84,13 @@ TEST(BuddyAllocator, CPUMultAlloc) { #ifndef PADDLE_ONLY_CPU +size_t align(size_t size, paddle::platform::GPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + TEST(BuddyAllocator, GPUAllocation) { void *p = nullptr; From ea916c84874e4a036650bccb2b2eae142a6c36ae Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 13:00:03 +0800 Subject: [PATCH 36/38] Fix: alignment metric --- paddle/memory/memory_test.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index e13cbabb268a8..2a2cb98a31b63 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include inline bool is_aligned(void const *p, const size_t n) { - return 0 == (reinterpret_cast(p) % n); + return 0 == (reinterpret_cast(p) & 0x3); } size_t align(size_t size, paddle::platform::CPUPlace place) { @@ -34,8 +34,6 @@ size_t align(size_t size, paddle::platform::CPUPlace place) { return remaining == 0 ? size : size + (alignment - remaining); } -void update_size(size_t &total_size, const size_t size) {} - TEST(BuddyAllocator, CPUAllocation) { void *p = nullptr; From 033523ea9d01bb0e860dbbfd6c5bab72d2c5b149 Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 13:02:17 +0800 Subject: [PATCH 37/38] update --- paddle/memory/memory_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 2a2cb98a31b63..53cc63a098d08 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include #include -inline bool is_aligned(void const *p, const size_t n) { +inline bool is_aligned(void const *p) { return 0 == (reinterpret_cast(p) & 0x3); } @@ -68,7 +68,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { } for (auto p : ps) { - EXPECT_EQ(is_aligned(p.first, 32), true); + EXPECT_EQ(is_aligned(p.first), true); paddle::memory::Free(cpu, p.first); // Buddy Allocator doesn't manage too large memory chunk @@ -123,7 +123,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { } for (auto p : ps) { - EXPECT_EQ(is_aligned(p.first, 32), true); + EXPECT_EQ(is_aligned(p.first), true); paddle::memory::Free(gpu, p.first); // Buddy Allocator doesn't manage too large memory chunk From 03b3d0d8a8b8e90f997e1a2cec49bb04486adc8a Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 14 Jul 2017 20:12:35 +0800 Subject: [PATCH 38/38] Follow comments --- paddle/platform/cpu_info.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc index 3da04420e57c3..1905cfeee6303 100644 --- a/paddle/platform/cpu_info.cc +++ b/paddle/platform/cpu_info.cc @@ -54,8 +54,8 @@ size_t CpuMaxAllocSize() { } size_t CpuMinChunkSize() { - // Allow to allocate the minimum chunk size is 256 bytes. - return 1 << 8; + // Allow to allocate the minimum chunk size is 4 KB. + return 1 << 12; } size_t CpuMaxChunkSize() {