Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Done] Memory Management: Buddy Allocator #2674

Merged
merged 46 commits into from
Jul 14, 2017
Merged
Show file tree
Hide file tree
Changes from 45 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
0e6ddcc
ENH: Add GPU throw error
gangliao Jun 29, 2017
d3b77a5
ENH: Add Gpu info
gangliao Jun 29, 2017
b29923f
ENH: Add CPU info
gangliao Jun 29, 2017
169022d
FIX: Improve fallback gpu allocator
gangliao Jun 29, 2017
68ab1ef
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
gangliao Jun 29, 2017
e6c14f7
ENH: Polish cpu info interface
gangliao Jun 29, 2017
6e7209f
ENH: Add gpu info interface
gangliao Jun 29, 2017
464886b
FIX: fix typo in piece.h
gangliao Jun 29, 2017
26cd0bb
ENH: count allocated fallback size for performance
gangliao Jun 29, 2017
fb51c3d
FIX: add compile dependency gflags
gangliao Jun 29, 2017
5ff172d
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
gangliao Jun 30, 2017
ec9e12a
Merge remote-tracking branch 'paddlepaddle/develop' into cpu_mem
gangliao Jul 3, 2017
275e5b7
FIX: yapf format version
gangliao Jul 3, 2017
89110fd
ENH: Add useGpu in system allocator
gangliao Jul 3, 2017
929f9cb
ENH: Add Metadata for memory block
gangliao Jul 3, 2017
bbd3eab
ENH: Add Alloc for buddy Allocator
gangliao Jul 3, 2017
4e1617d
ENH: add buddy alloctor Free
gangliao Jul 4, 2017
ff36389
ENH: code style
gangliao Jul 4, 2017
fb41350
Merge conflict
gangliao Jul 4, 2017
379434b
Delete cmake in dynload
gangliao Jul 4, 2017
0ba6347
ENH: Add buddy allocator Free
gangliao Jul 4, 2017
4dc3c9e
ENH: Add paddle_memory for external usage
gangliao Jul 4, 2017
d0ad031
FIX: glog dependency
gangliao Jul 5, 2017
ada1c20
FIX: Buddy Allocator Free with Merge feature
gangliao Jul 5, 2017
7469178
ENH: add memory unit test
gangliao Jul 5, 2017
936cd1e
FIX: code format
gangliao Jul 5, 2017
5d2e8ed
FIX: dynamic loader deps
gangliao Jul 5, 2017
3ad8e36
FIX: merge static libs with propagation dependencies
gangliao Jul 6, 2017
a669bf4
FIX: explicit construct pool element
gangliao Jul 6, 2017
adf8c95
FIX: propagation dependencies under linux
gangliao Jul 6, 2017
ddfa6cf
FIX: remove boost from memory folder
gangliao Jul 6, 2017
1ce2fca
Merge conflicts
gangliao Jul 10, 2017
199b5fc
ENH: refine code comments
gangliao Jul 10, 2017
49fd49f
Fix conflicts
gangliao Jul 11, 2017
d4017ca
ENH: Add auto-free if allocate too much
gangliao Jul 11, 2017
6a3b841
FIX: clang-format
gangliao Jul 11, 2017
f404282
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
gangliao Jul 11, 2017
383b96f
FIX: merge conflicts
gangliao Jul 11, 2017
ff98e3c
ENH: Remove comments
gangliao Jul 13, 2017
00572aa
Add memory alignment test
gangliao Jul 13, 2017
ab5fe1e
ENH: memory test: check alignment and memory size
gangliao Jul 14, 2017
365b457
Merge conflicts
gangliao Jul 14, 2017
21b7915
Fix condition compile
gangliao Jul 14, 2017
ea916c8
Fix: alignment metric
gangliao Jul 14, 2017
033523e
update
gangliao Jul 14, 2017
03b3d0d
Follow comments
gangliao Jul 14, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions cmake/generic.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ function(merge_static_libs TARGET_NAME)
foreach(lib ${libs})
list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
endforeach()
list(REMOVE_DUPLICATES libs_deps)

if(APPLE) # Use OSX's libtool to merge archives
# To produce a library we need at least one source file.
Expand All @@ -127,7 +128,7 @@ function(merge_static_libs TARGET_NAME)
# Get the file names of the libraries to be merged
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
endforeach()
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
else() # general UNIX: use "ar" to extract objects and re-add to a common lib
Expand All @@ -145,11 +146,11 @@ function(merge_static_libs TARGET_NAME)
DEPENDS ${lib} ${objdir}
WORKING_DIRECTORY ${objdir})

# Empty dummy source file that goes into merged library
set(mergebase ${lib}.mergebase.c)
add_custom_command(OUTPUT ${mergebase}
COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
DEPENDS ${objlistfile})
# Empty dummy source file that goes into merged library
set(mergebase ${lib}.mergebase.c)
add_custom_command(OUTPUT ${mergebase}
COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
DEPENDS ${objlistfile})

list(APPEND mergebases "${mergebase}")
endforeach()
Expand Down
10 changes: 10 additions & 0 deletions paddle/memory/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1 +1,11 @@
add_subdirectory(detail)

cc_library(memory SRCS memory.cc)

cc_library(paddle_memory
DEPS
memory meta_data
meta_cache memory_block
buddy_allocator system_allocator)

cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
16 changes: 12 additions & 4 deletions paddle/memory/detail/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
if(${WITH_GPU})
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
else(${WITH_GPU})
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
endif(${WITH_GPU})

cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)

cc_library(meta_data SRCS meta_data.cc)

cc_library(meta_cache SRCS meta_cache.cc)

cc_library(memory_block SRCS memory_block.cc)

cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
315 changes: 305 additions & 10 deletions paddle/memory/detail/buddy_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,317 @@
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/memory/detail/buddy_allocator.h"
#include "glog/logging.h"

namespace paddle {
namespace memory {
namespace detail {

BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
SystemAllocator* system_allocator)
: pool_size_(pool_size),
max_pools_(max_pools),
system_allocator_(system_allocator) {
PADDLE_ASSERT(pool_size > 0);
PADDLE_ASSERT(max_pools > 0);
PADDLE_ASSERT(system_allocator != nullptr);
// Construct a buddy allocator on top of a backing system allocator.
// `min_chunk_size` is the rounding granularity for requests; requests
// larger than `max_chunk_size` bypass the pool (see Alloc).
// NOTE(review): `system_allocator` must be non-null and outlive this
// object if it is stored as a raw pointer — TODO confirm ownership in
// the header. The original wrapped it in std::move, which is a no-op on
// a raw pointer (clang-tidy performance-move-const-arg) and was removed.
BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
                               size_t min_chunk_size, size_t max_chunk_size)
    : min_chunk_size_(min_chunk_size),
      max_chunk_size_(max_chunk_size),
      cache_(system_allocator->UseGpu()),
      system_allocator_(system_allocator) {}

BuddyAllocator::~BuddyAllocator() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there multiple instances of BuddyAllocator in one trainer?

Copy link
Contributor Author

@gangliao gangliao Jul 11, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these "
"have actually been freed";
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_
<< ")";

system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
pool_.erase(pool_.begin());
}
}

// Round `size` up to the next multiple of `alignment`.
// A size that is already a multiple is returned unchanged.
inline size_t align(size_t size, size_t alignment) {
  const size_t remainder = size % alignment;
  if (remainder == 0) {
    return size;
  }
  return size + (alignment - remainder);
}

// Allocate `unaligned_size` usable bytes. The request is padded with a
// Metadata header and rounded up to min_chunk_size_. Requests larger than
// max_chunk_size_ are delegated to the system allocator and never enter
// the buddy pool. Returns nullptr when no memory can be obtained.
void* BuddyAllocator::Alloc(size_t unaligned_size) {
  // adjust allocation alignment: reserve space for the Metadata header,
  // then round up to the minimum chunk size
  size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);

  // acquire the allocator lock (released when `lock` goes out of scope)
  std::lock_guard<std::mutex> lock(mutex_);

  DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size "
             << size;

  // if the allocation is huge, send directly to the system allocator;
  // such chunks are tagged HUGE_CHUNK so Free() returns them directly too
  if (size > max_chunk_size_) {
    DLOG(INFO) << "Allocate from system allocator.";
    return SystemAlloc(size);
  }

  // query and allocate from the existing chunk pool
  auto it = FindExistChunk(size);

  // refill the pool from the system allocator on failure
  if (it == pool_.end()) {
    it = RefillPool();
    // if the refill also failed, give up and return nullptr
    if (it == pool_.end()) {
      return nullptr;
    }
  } else {
    DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it)
               << " at address "
               << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
  }

  // bookkeeping: the chosen chunk moves from the free to the used tally
  total_used_ += size;
  total_free_ -= size;

  // split the chunk down to `size` and return its data area for use
  return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
}

// Return a pointer previously obtained from Alloc(). Huge chunks go
// straight back to the system allocator; pool chunks are marked free,
// merged with any free left/right buddies, reinserted into the pool,
// and idle memory may then be trimmed back to the system.
void BuddyAllocator::Free(void* p) {
  // Point back to metadata: `p` is the data area, the block header
  // precedes it
  auto block = static_cast<MemoryBlock*>(p)->metadata();

  // Acquire the allocator lock
  std::lock_guard<std::mutex> lock(mutex_);

  DLOG(INFO) << "Free from address " << block;

  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
    // Huge chunks never entered the buddy pool (see Alloc), so hand them
    // directly back to the system allocator
    DLOG(INFO) << "Free directly from system allocator";
    system_allocator_->Free(block, block->total_size(cache_),
                            block->index(cache_));

    // Invalidate GPU allocation from cache
    cache_.invalidate(block);

    return;
  }

  block->mark_as_free(cache_);

  total_used_ -= block->total_size(cache_);
  total_free_ += block->total_size(cache_);

  // Trying to merge the right buddy
  if (block->has_right_buddy(cache_)) {
    DLOG(INFO) << "Merging this block " << block << " with its right buddy "
               << block->right_buddy(cache_);

    auto right_buddy = block->right_buddy(cache_);

    if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
      // Take away right buddy from pool before merging, since its
      // key (size/address) becomes stale after the merge
      pool_.erase(IndexSizeAddress(right_buddy->index(cache_),
                                   right_buddy->total_size(cache_),
                                   right_buddy));

      // merge its right buddy to the block
      block->merge(cache_, right_buddy);
    }
  }

  // Trying to merge the left buddy
  if (block->has_left_buddy(cache_)) {
    DLOG(INFO) << "Merging this block " << block << " with its left buddy "
               << block->left_buddy(cache_);

    auto left_buddy = block->left_buddy(cache_);

    if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
      // Take away left buddy from pool before merging
      pool_.erase(IndexSizeAddress(left_buddy->index(cache_),
                                   left_buddy->total_size(cache_), left_buddy));

      // merge the block to its left buddy; the merged block is
      // addressed by the left buddy from here on
      left_buddy->merge(cache_, block);
      block = left_buddy;
    }
  }

  // Dumping this (possibly merged) block into pool
  DLOG(INFO) << "Inserting free block (" << block << ", "
             << block->total_size(cache_) << ")";
  pool_.insert(
      IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));

  // Clean up if there is too much free memory

  // Prefer freeing fallback allocation first
  CleanIdleFallBackAlloc();

  // Free normal allocation
  CleanIdleNormalAlloc();
}

size_t BuddyAllocator::Used() { return total_used_; }

// Service a request directly from the backing system allocator, bypassing
// the buddy pool. Returns the chunk's data area, or nullptr on failure.
void* BuddyAllocator::SystemAlloc(size_t size) {
  size_t index = 0;
  void* raw = system_allocator_->Alloc(index, size);

  DLOG(INFO) << "Allocated " << raw << " from system allocator.";

  if (raw == nullptr) {
    return nullptr;
  }

  // Tag the chunk as HUGE so that Free() hands it straight back to the
  // system allocator instead of the pool.
  auto* block = static_cast<MemoryBlock*>(raw);
  block->init(cache_, MemoryBlock::HUGE_CHUNK, index, size, nullptr, nullptr);
  return block->data();
}

BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
#ifndef PADDLE_ONLY_CPU
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation.
max_chunk_size_ = platform::GpuMaxChunkSize();
}
}
#endif // PADDLE_ONLY_CPU

// Allocate a new maximum sized block
size_t index = 0;
void* p = system_allocator_->Alloc(index, max_chunk_size_);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we allocate more than max_chunk_size_ if there's not enough in the pool_, so that allocated memory is contiguous, introducing fewer memory fragments? Or I don't know if max_chunk_size_ could be like 1G to do that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we can allocate a chunk size bigger than max_chunk_size_, but it will not be managed by the buddy allocator. You can check this line: https://github.com/PaddlePaddle/Paddle/pull/2674/files#diff-dd894d330dd6a0deb01afe3fe24b1752R59

@typhoonzero

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well I mean

  1. shall we set max_chunk_size_ >= 1G so that alloc ops after will be faster.
  2. or shall we alocate 10 * max_chunk_size_ in RefillPool for performance.

Copy link
Contributor Author

@gangliao gangliao Jul 13, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are two situations in here.

For GPU, it's bad to specify max_chunk_size >= 1G or 10 * max_chunk_size_. It's better to set max_chunk_size_ according to the current device's resources.

size_t GpuMaxChunkSize() {
  size_t total = 0;
  size_t available = 0;

  GpuMemoryUsage(available, total);

  // Reserving the rest memory for page tables, etc.
  size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;

  // If available less than minimum chunk size, no usable memory exists.
  available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize();

  // If available less than reserving, no usable memory exists.
  size_t usable = std::max(available, reserving) - reserving;

  return usable;
}

For CPU, again, too large memory chunk should not be managed by Buddy allocator, it‘s one-time usage.

size_t CpuMaxAllocSize() {
  // For distributed systems, it requires configuring and limiting
  // the fraction of memory to use.
  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
}

size_t CpuMinChunkSize() {
  // Allow to allocate the minimum chunk size is 256 bytes.
  return 1 << 8;
}

size_t CpuMaxChunkSize() {
  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
  return CpuMaxAllocSize() / 32;
}

For 16GB node, 3% means roughly 500 MB, I think it's good enough.

FLAGS_fraction_of_cpu_memory_to_use is to expose to kubernetes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great explanation! I totally agree with you!

Maybe minimum chunk size of 4K is best for performance because default linux memory page size is 4K.

Copy link
Contributor Author

@gangliao gangliao Jul 14, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@typhoonzero That's a good idea. But 4K means 4096 bytes -> 1024 floats;
if we frequently allocate small chunks, like 256, 128, 32, or 64 floats, any of them will be padded to 4K — doesn't that waste memory?

Copy link
Contributor Author

@gangliao gangliao Jul 14, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably for CPU, using 4k. For GPU, maybe default 4k is not a good idea.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, only for CPU.


if (p == nullptr) return pool_.end();

DLOG(INFO) << "Creating and inserting new block " << p
<< " from system allocator";

static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr);

// gpu fallback allocation
if (system_allocator_->UseGpu() &&
static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
fallback_alloc_count_++;
}

total_free_ += max_chunk_size_;

// dump the block into pool
return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
}

// Search the pool for a free chunk of at least `size` bytes. The pool is
// ordered by (index, size, address); the scan walks index groups upward
// until a chunk large enough is found. Returns pool_.end() on failure.
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
  size_t index = 0;

  for (;;) {
    auto candidate = pool_.lower_bound(IndexSizeAddress(index, size, nullptr));

    // no matching chunk in the pool at all
    if (candidate == pool_.end()) {
      return candidate;
    }

    // still inside the current index group: this chunk fits, take it
    if (std::get<0>(*candidate) <= index) {
      return candidate;
    }

    // landed in a higher index group: accept if large enough ...
    if (std::get<1>(*candidate) >= size) {
      return candidate;
    }

    // ... otherwise restart the search within that group
    index = std::get<0>(*candidate);
  }
}

// Remove the chunk at `it` from the pool, split off `size` bytes for the
// caller, and return the unused right-hand remainder (if any) to the pool.
// Returns the allocated block header.
void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
                                   size_t size) {
  auto* chunk = static_cast<MemoryBlock*>(std::get<2>(*it));
  pool_.erase(it);

  DLOG(INFO) << "Split block (" << chunk << ", " << chunk->total_size(cache_)
             << ") into";
  chunk->split(cache_, size);

  DLOG(INFO) << "Left block (" << chunk << ", " << chunk->total_size(cache_)
             << ")";
  chunk->set_type(cache_, MemoryBlock::ARENA_CHUNK);

  // If the split produced a free remainder, put it back into the pool.
  if (chunk->has_right_buddy(cache_)) {
    auto* remainder = chunk->right_buddy(cache_);
    if (remainder->type(cache_) == MemoryBlock::FREE_CHUNK) {
      DLOG(INFO) << "Insert right block (" << remainder << ", "
                 << remainder->total_size(cache_) << ")";

      pool_.insert(IndexSizeAddress(remainder->index(cache_),
                                    remainder->total_size(cache_), remainder));
    }
  }

  return chunk;
}

// Return idle full-sized chunks that came from the GPU fallback
// allocator back to the system, scanning from the largest pool entries.
// Fallback chunks are identified by index != 0 (RefillPool counts
// index == 1 chunks as fallback allocations) — assumes the index
// convention holds; confirm against system_allocator.
void BuddyAllocator::CleanIdleFallBackAlloc() {
  // If fallback allocation does not exist, return directly
  if (!fallback_alloc_count_) return;

  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
    // Reverse order visits largest entries first; once a block smaller
    // than max_chunk_size_ appears, no full-sized chunks remain
    if (std::get<1>(*pool) < max_chunk_size_) return;

    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));

    // If no GPU fallback allocator, or this block is a normal (index 0)
    // chunk, stop — fallback chunks sort after normal ones
    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
      return;
    }

    DLOG(INFO) << "Return block " << block << " to fallback allocator.";

    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
    cache_.invalidate(block);

    // Erase through the reverse iterator: std::next(pool).base() is the
    // forward iterator to this element; rebuild the reverse iterator
    // from erase()'s returned successor
    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));

    total_free_ -= max_chunk_size_;
    fallback_alloc_count_--;

    // If no fallback allocation exists, return directly
    if (!fallback_alloc_count_) return;
  }
}

// Return idle full-sized chunks to the system allocator when the pool
// holds substantially more free memory than is currently in use.
void BuddyAllocator::CleanIdleNormalAlloc() {
  // Heuristic: keep freeing while fallback allocations remain, or while
  // free memory exceeds twice the used amount (plus one chunk of slack)
  auto shall_free_alloc = [&]() -> bool {
    // free all fallback allocations
    if (fallback_alloc_count_ > 0) {
      return true;
    }
    // keep 2x overhead if we haven't fallen back
    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
      return true;
    }
    return false;
  };

  if (!shall_free_alloc()) return;

  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
    // Largest entries come first in reverse order; once a block smaller
    // than max_chunk_size_ appears, no full-sized chunks remain
    if (std::get<1>(*pool) < max_chunk_size_) return;

    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));

    DLOG(INFO) << "Return block " << block << " to base allocator.";

    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
    cache_.invalidate(block);

    // Erase through the reverse iterator (see CleanIdleFallBackAlloc for
    // the std::next(...).base() idiom)
    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));

    total_free_ -= max_chunk_size_;

    if (!shall_free_alloc()) return;
  }
}

} // namespace detail
Expand Down
Loading