-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Done] Memory Management: Buddy Allocator #2674
Changes from 45 commits
0e6ddcc
d3b77a5
b29923f
169022d
68ab1ef
e6c14f7
6e7209f
464886b
26cd0bb
fb51c3d
5ff172d
ec9e12a
275e5b7
89110fd
929f9cb
bbd3eab
4e1617d
ff36389
fb41350
379434b
0ba6347
4dc3c9e
d0ad031
ada1c20
7469178
936cd1e
5d2e8ed
3ad8e36
a669bf4
adf8c95
ddfa6cf
1ce2fca
199b5fc
49fd49f
d4017ca
6a3b841
f404282
383b96f
ff98e3c
00572aa
ab5fe1e
365b457
21b7915
ea916c8
033523e
03b3d0d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,11 @@ | ||
# Build rules for the memory module; the allocator internals are built from the detail/ subdirectory.
add_subdirectory(detail) | ||
|
||
# Library built from memory.cc alone.
cc_library(memory SRCS memory.cc) | ||
|
||
# Umbrella target with no sources of its own: it only lists the memory
# library and the allocator component libraries as dependencies.
cc_library(paddle_memory | ||
DEPS | ||
memory meta_data | ||
meta_cache memory_block | ||
buddy_allocator system_allocator) | ||
|
||
# Test built from memory_test.cc, depending on place and the umbrella target above.
cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,15 @@ | ||
# Build rules for the allocator components.
# NOTE(review): this listing comes from a diff view and appears to contain both
# the removed and the added lines, so system_allocator and
# system_allocator_test are each declared more than once below — confirm
# against the real CMakeLists.txt before relying on it.
# system_allocator is declared with nv_library when WITH_GPU is set and with
# cc_library otherwise.
if(${WITH_GPU}) | ||
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) | ||
nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) | ||
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) | ||
else(${WITH_GPU}) | ||
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) | ||
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) | ||
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info) | ||
endif(${WITH_GPU}) | ||
|
||
# Declared outside the if/else, so the same test rule applies to both configurations.
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) | ||
|
||
# One library per source file; only buddy_allocator lists a dependency (glog).
cc_library(meta_data SRCS meta_data.cc) | ||
|
||
cc_library(meta_cache SRCS meta_cache.cc) | ||
|
||
cc_library(memory_block SRCS memory_block.cc) | ||
|
||
cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,22 +12,317 @@ | |
See the License for the specific language governing permissions and | ||
limitations under the License. */ | ||
|
||
#pragma once | ||
|
||
#include "paddle/memory/detail/buddy_allocator.h" | ||
#include "glog/logging.h" | ||
|
||
namespace paddle { | ||
namespace memory { | ||
namespace detail { | ||
|
||
BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools, | ||
SystemAllocator* system_allocator) | ||
: pool_size_(pool_size), | ||
max_pools_(max_pools), | ||
system_allocator_(system_allocator) { | ||
PADDLE_ASSERT(pool_size > 0); | ||
PADDLE_ASSERT(max_pools > 0); | ||
PADDLE_ASSERT(system_allocator != nullptr); | ||
// Constructs a buddy allocator on top of `system_allocator`.
//
// Stores `min_chunk_size` and `max_chunk_size`, initializes the metadata
// cache with system_allocator->UseGpu(), and keeps the system allocator
// pointer for later Alloc/Free calls.
//
// `system_allocator` is dereferenced in the initializer list without a null
// check, so callers must pass a valid pointer.
// NOTE(review): the unterminated constructor header immediately above this
// one looks like the removed (pre-change) version left over from the diff
// view — confirm and drop it when restoring the real file.
BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, | ||
size_t min_chunk_size, size_t max_chunk_size) | ||
: min_chunk_size_(min_chunk_size), | ||
max_chunk_size_(max_chunk_size), | ||
cache_(system_allocator->UseGpu()), | ||
system_allocator_(std::move(system_allocator)) {} | ||
|
||
// Drains the free pool: every entry still in pool_ is handed back to the
// system allocator (with max_chunk_size_ as the size argument), dropped from
// the metadata cache and removed from the pool.
BuddyAllocator::~BuddyAllocator() {
  DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these "
                "have actually been freed";

  while (!pool_.empty()) {
    // Always work on the first pool entry; it is erased at the end of the
    // iteration, so the loop terminates once the pool is empty.
    auto head = pool_.begin();
    auto* chunk = static_cast<MemoryBlock*>(std::get<2>(*head));

    DLOG(INFO) << "Free from block (" << chunk << ", " << max_chunk_size_
               << ")";

    system_allocator_->Free(chunk, max_chunk_size_, chunk->index(cache_));
    cache_.invalidate(chunk);
    pool_.erase(head);
  }
}
|
||
// Rounds `size` up to the nearest multiple of `alignment`.
//
// @param size       the value to round up.
// @param alignment  the granularity to round to; 0 means "no alignment".
// @return `size` if it is already a multiple of `alignment` (or if
//         `alignment` is 0), otherwise the next larger multiple.
inline size_t align(size_t size, size_t alignment) {
  // A zero alignment would make the modulo below undefined behaviour, so
  // return the size unchanged instead of dividing by zero.
  if (alignment == 0) return size;

  const size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}
|
||
void* BuddyAllocator::Alloc(size_t unaligned_size) { | ||
// adjust allocation alignment | ||
size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); | ||
|
||
// acquire the allocator lock | ||
std::lock_guard<std::mutex> lock(mutex_); | ||
|
||
DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size " | ||
<< size; | ||
|
||
// if the allocation is huge, send directly to the system allocator | ||
if (size > max_chunk_size_) { | ||
DLOG(INFO) << "Allocate from system allocator."; | ||
return SystemAlloc(size); | ||
} | ||
|
||
// query and allocate from the existing chunk | ||
auto it = FindExistChunk(size); | ||
|
||
// refill the pool if failure | ||
if (it == pool_.end()) { | ||
it = RefillPool(); | ||
// if still failure, fail fatally | ||
if (it == pool_.end()) { | ||
return nullptr; | ||
} | ||
} else { | ||
DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it) | ||
<< " at address " | ||
<< reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data(); | ||
} | ||
|
||
total_used_ += size; | ||
total_free_ -= size; | ||
|
||
// split the allocation and return data for use | ||
return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data(); | ||
} | ||
|
||
void BuddyAllocator::Free(void* p) { | ||
// Point back to metadata | ||
auto block = static_cast<MemoryBlock*>(p)->metadata(); | ||
|
||
// Acquire the allocator lock | ||
std::lock_guard<std::mutex> lock(mutex_); | ||
|
||
DLOG(INFO) << "Free from address " << block; | ||
|
||
if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { | ||
DLOG(INFO) << "Free directly from system allocator"; | ||
system_allocator_->Free(block, block->total_size(cache_), | ||
block->index(cache_)); | ||
|
||
// Invalidate GPU allocation from cache | ||
cache_.invalidate(block); | ||
|
||
return; | ||
} | ||
|
||
block->mark_as_free(cache_); | ||
|
||
total_used_ -= block->total_size(cache_); | ||
total_free_ += block->total_size(cache_); | ||
|
||
// Trying to merge the right buddy | ||
if (block->has_right_buddy(cache_)) { | ||
DLOG(INFO) << "Merging this block " << block << " with its right buddy " | ||
<< block->right_buddy(cache_); | ||
|
||
auto right_buddy = block->right_buddy(cache_); | ||
|
||
if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { | ||
// Take away right buddy from pool | ||
pool_.erase(IndexSizeAddress(right_buddy->index(cache_), | ||
right_buddy->total_size(cache_), | ||
right_buddy)); | ||
|
||
// merge its right buddy to the block | ||
block->merge(cache_, right_buddy); | ||
} | ||
} | ||
|
||
// Trying to merge the left buddy | ||
if (block->has_left_buddy(cache_)) { | ||
DLOG(INFO) << "Merging this block " << block << " with its left buddy " | ||
<< block->left_buddy(cache_); | ||
|
||
auto left_buddy = block->left_buddy(cache_); | ||
|
||
if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { | ||
// Take away right buddy from pool | ||
pool_.erase(IndexSizeAddress(left_buddy->index(cache_), | ||
left_buddy->total_size(cache_), left_buddy)); | ||
|
||
// merge the block to its left buddy | ||
left_buddy->merge(cache_, block); | ||
block = left_buddy; | ||
} | ||
} | ||
|
||
// Dumping this block into pool | ||
DLOG(INFO) << "Inserting free block (" << block << ", " | ||
<< block->total_size(cache_) << ")"; | ||
pool_.insert( | ||
IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); | ||
|
||
// Clean up if existing too much free memory | ||
|
||
// Prefer freeing fallback allocation first | ||
CleanIdleFallBackAlloc(); | ||
|
||
// Free normal allocation | ||
CleanIdleNormalAlloc(); | ||
} | ||
|
||
size_t BuddyAllocator::Used() { return total_used_; } | ||
|
||
void* BuddyAllocator::SystemAlloc(size_t size) { | ||
size_t index = 0; | ||
void* p = system_allocator_->Alloc(index, size); | ||
|
||
DLOG(INFO) << "Allocated " << p << " from system allocator."; | ||
|
||
if (p == nullptr) return nullptr; | ||
|
||
static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, | ||
size, nullptr, nullptr); | ||
|
||
return static_cast<MemoryBlock*>(p)->data(); | ||
} | ||
|
||
// Grows the pool by one chunk of max_chunk_size_ bytes requested from the
// system allocator.
//
// Returns an iterator to the newly inserted pool entry, or pool_.end() when
// the system allocator returned nullptr.
//
// NOTE(review): the "There was a problem hiding this comment ..." text between
// the Alloc call and the nullptr check is review discussion captured from the
// web page, not part of the function.
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { | ||
#ifndef PADDLE_ONLY_CPU | ||
// On a GPU allocator, while nothing has been used or pooled yet,
// max_chunk_size_ is replaced by platform::GpuMaxChunkSize().
if (system_allocator_->UseGpu()) { | ||
if ((total_used_ + total_free_) == 0) { | ||
// Compute the maximum allocation size for the first allocation. | ||
max_chunk_size_ = platform::GpuMaxChunkSize(); | ||
} | ||
} | ||
#endif // PADDLE_ONLY_CPU | ||
|
||
// Allocate a new maximum sized block | ||
size_t index = 0; | ||
void* p = system_allocator_->Alloc(index, max_chunk_size_); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we allocate more than There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, we can allocate chunk size bigger than the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well I mean
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are two situations in here. For GPU, it's bad to specify size_t GpuMaxChunkSize() {
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(available, total);
// Reserving the rest memory for page tables, etc.
size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;
// If available less than minimum chunk size, no usable memory exists.
available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize();
// If available less than reserving, no usable memory exists.
size_t usable = std::max(available, reserving) - reserving;
return usable;
} For CPU, again, too large memory chunk should not be managed by Buddy allocator, it‘s one-time usage. size_t CpuMaxAllocSize() {
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
}
size_t CpuMinChunkSize() {
// Allow to allocate the minimum chunk size is 256 bytes.
return 1 << 8;
}
size_t CpuMaxChunkSize() {
// Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
return CpuMaxAllocSize() / 32;
} For 16GB node, 3% means roughly 500 MB, I think it's good enough.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Great explanation! I totally agree with you! Maybe minimum chunk size of 4K is best for performance because default linux memory page size is 4K. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @typhoonzero That's a good idea. But There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably for CPU, using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, only for CPU. |
||
|
||
// Allocation failed: signal it to the caller with the end iterator.
if (p == nullptr) return pool_.end(); | ||
|
||
DLOG(INFO) << "Creating and inserting new block " << p | ||
<< " from system allocator"; | ||
|
||
// The new chunk starts out as a single free block with no buddies.
static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, | ||
max_chunk_size_, nullptr, nullptr); | ||
|
||
// gpu fallback allocation | ||
// A GPU block whose index is 1 is counted as a fallback allocation.
if (system_allocator_->UseGpu() && | ||
static_cast<MemoryBlock*>(p)->index(cache_) == 1) { | ||
fallback_alloc_count_++; | ||
} | ||
|
||
total_free_ += max_chunk_size_; | ||
|
||
// dump the block into pool | ||
return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; | ||
} | ||
|
||
// Searches the pool for a free chunk able to hold `size` bytes.
//
// The pool is probed with lower_bound on (index, size, nullptr), starting at
// index 0. A hit that does not have a larger index than the one probed is
// returned as is. A hit with a larger index is returned only when it is big
// enough; otherwise the probe is repeated from that larger index.
//
// Returns pool_.end() when no probe finds an entry.
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
  size_t probe_index = 0;

  for (;;) {
    auto candidate =
        pool_.lower_bound(IndexSizeAddress(probe_index, size, nullptr));

    // Ran off the end of the pool: nothing matches.
    if (candidate == pool_.end()) {
      return candidate;
    }

    const auto found_index = std::get<0>(*candidate);

    // The hit does not have a larger index than the one probed: take it.
    if (!(found_index > probe_index)) {
      return candidate;
    }

    // The hit has a larger index: accept it only when it is large enough.
    if (std::get<1>(*candidate) >= size) {
      return candidate;
    }

    // Too small: probe again, now starting from that index.
    probe_index = found_index;
  }
}
|
||
// Removes the chunk referenced by `it` from the pool, splits `size` bytes off
// it, marks that part as ARENA_CHUNK and returns it. If the split leaves a
// right buddy that is a FREE_CHUNK, that remainder is inserted into the pool.
void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
                                   size_t size) {
  // Read the block pointer before the iterator is invalidated by erase.
  auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
  pool_.erase(it);

  DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_)
             << ") into";
  block->split(cache_, size);

  DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_)
             << ")";
  block->set_type(cache_, MemoryBlock::ARENA_CHUNK);

  // No right buddy: there is nothing to return to the pool.
  if (!block->has_right_buddy(cache_)) {
    return block;
  }

  auto remainder = block->right_buddy(cache_);
  if (remainder->type(cache_) == MemoryBlock::FREE_CHUNK) {
    DLOG(INFO) << "Insert right block (" << remainder << ", "
               << remainder->total_size(cache_) << ")";

    pool_.insert(IndexSizeAddress(remainder->index(cache_),
                                  remainder->total_size(cache_), remainder));
  }

  return block;
}
|
||
void BuddyAllocator::CleanIdleFallBackAlloc() { | ||
// If fallback allocation does not exist, return directly | ||
if (!fallback_alloc_count_) return; | ||
|
||
for (auto pool = pool_.rbegin(); pool != pool_.rend();) { | ||
// If free memory block less than max_chunk_size_, return directly | ||
if (std::get<1>(*pool) < max_chunk_size_) return; | ||
|
||
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool)); | ||
|
||
// If no GPU fallback allocator, return | ||
if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { | ||
return; | ||
} | ||
|
||
DLOG(INFO) << "Return block " << block << " to fallback allocator."; | ||
|
||
system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); | ||
cache_.invalidate(block); | ||
|
||
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); | ||
|
||
total_free_ -= max_chunk_size_; | ||
fallback_alloc_count_--; | ||
|
||
// If no fall allocation exists, return directly | ||
if (!fallback_alloc_count_) return; | ||
} | ||
} | ||
|
||
void BuddyAllocator::CleanIdleNormalAlloc() { | ||
auto shall_free_alloc = [&]() -> bool { | ||
// free all fallback allocations | ||
if (fallback_alloc_count_ > 0) { | ||
return true; | ||
} | ||
// keep 2x overhead if we haven't fallen back | ||
if ((total_used_ + max_chunk_size_) * 2 < total_free_) { | ||
return true; | ||
} | ||
return false; | ||
}; | ||
|
||
if (!shall_free_alloc()) return; | ||
|
||
for (auto pool = pool_.rbegin(); pool != pool_.rend();) { | ||
// If free memory block less than max_chunk_size_, return directly | ||
if (std::get<1>(*pool) < max_chunk_size_) return; | ||
|
||
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool)); | ||
|
||
DLOG(INFO) << "Return block " << block << " to base allocator."; | ||
|
||
system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); | ||
cache_.invalidate(block); | ||
|
||
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); | ||
|
||
total_free_ -= max_chunk_size_; | ||
|
||
if (!shall_free_alloc()) return; | ||
} | ||
} | ||
|
||
} // namespace detail | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are there multiple instances of
BuddyAllocator
in one trainer?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://github.com/PaddlePaddle/Paddle/pull/2674/files#diff-8d7a07775123d20061c8100dd8ed402dR52
BuddyAllocator is a singleton; there is one instance for each GPU/CPU.
@typhoonzero