Skip to content

Commit

Permalink
[Enhancement] Improve performance of strings::memcpy_inlined (#13330)
Browse files Browse the repository at this point in the history
This PR is to improve memcpy to accelerate block cache:
- use overlapped move to optimize the case when size <= 16
- use avx(256bit reg) move to optimize the case when size <= 256
- use erms(enhanced repeat movsb/stosb) to optimize the case when size in [512KB,2MB]
- use unrolled avx(256bit reg) move to optimze the rest cases.
  • Loading branch information
dirtysalt committed Nov 17, 2022
1 parent 2ab3607 commit d940e8a
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 65 deletions.
8 changes: 6 additions & 2 deletions be/src/block_cache/fb_cachelib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "common/logging.h"
#include "common/statusor.h"
#include "gutil/strings/fastmem.h"
#include "util/filesystem_util.h"

namespace starrocks {
Expand Down Expand Up @@ -60,7 +61,8 @@ Status FbCacheLib::write_cache(const std::string& key, const char* value, size_t
if (!handle) {
return Status::InternalError("allocate cachelib item failed");
}
std::memcpy(handle->getMemory(), value, size);
// std::memcpy(handle->getMemory(), value, size);
strings::memcpy_inlined(handle->getMemory(), value, size);
_cache->insertOrReplace(handle);
return Status::OK();
}
Expand All @@ -74,7 +76,9 @@ StatusOr<size_t> FbCacheLib::read_cache(const std::string& key, char* value, siz
return Status::NotFound("not found cachelib item");
}
DCHECK((off + size) <= handle->getSize());
std::memcpy(value, (char*)handle->getMemory() + off, size);
// std::memcpy(value, (char*)handle->getMemory() + off, size);
strings::memcpy_inlined(value, (char*)handle->getMemory() + off, size);

if (handle->hasChainedItem()) {
}
return size;
Expand Down
151 changes: 89 additions & 62 deletions be/src/gutil/strings/fastmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@

#pragma once

#include <emmintrin.h>
#include <immintrin.h>

#include <cstddef>
#include <cstdint>
#include <cstdio>
Expand Down Expand Up @@ -94,68 +97,92 @@ inline int fastmemcmp_inlined(const void* a_void, const void* b_void, size_t n)
return 0;
}

// The standard memcpy operation is slow for variable small sizes.
// This implementation inlines the optimal realization for sizes 1 to 16.
// To avoid code bloat don't use it in case of not performance-critical spots,
// nor when you don't expect very frequent values of size <= 16.
inline void memcpy_inlined(void* dst, const void* src, size_t size) {
// Compiler inlines code with minimal amount of data movement when third
// parameter of memcpy is a constant.
switch (size) {
case 0:
break;
case 1:
memcpy(dst, src, 1);
break;
case 2:
memcpy(dst, src, 2);
break;
case 3:
memcpy(dst, src, 3);
break;
case 4:
memcpy(dst, src, 4);
break;
case 5:
memcpy(dst, src, 5);
break;
case 6:
memcpy(dst, src, 6);
break;
case 7:
memcpy(dst, src, 7);
break;
case 8:
memcpy(dst, src, 8);
break;
case 9:
memcpy(dst, src, 9);
break;
case 10:
memcpy(dst, src, 10);
break;
case 11:
memcpy(dst, src, 11);
break;
case 12:
memcpy(dst, src, 12);
break;
case 13:
memcpy(dst, src, 13);
break;
case 14:
memcpy(dst, src, 14);
break;
case 15:
memcpy(dst, src, 15);
break;
case 16:
memcpy(dst, src, 16);
break;
default:
memcpy(dst, src, size);
break;
inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src, size_t size) {
auto dst = static_cast<uint8_t*>(_dst);
auto src = static_cast<const uint8_t*>(_src);

[[maybe_unused]] tail : if (size <= 16) {
if (size >= 8) {
__builtin_memcpy(dst + size - 8, src + size - 8, 8);
__builtin_memcpy(dst, src, 8);
} else if (size >= 4) {
__builtin_memcpy(dst + size - 4, src + size - 4, 4);
__builtin_memcpy(dst, src, 4);
} else if (size >= 2) {
__builtin_memcpy(dst + size - 2, src + size - 2, 2);
__builtin_memcpy(dst, src, 2);
} else if (size >= 1) {
*dst = *src;
}
}
}
else {
#ifdef __AVX2__
if (size <= 256) {
if (size <= 32) {
__builtin_memcpy(dst, src, 8);
__builtin_memcpy(dst + 8, src + 8, 8);
size -= 16;
dst += 16;
src += 16;
goto tail;
}

while (size > 32) {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst),
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)));
dst += 32;
src += 32;
size -= 32;
}

_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + size - 32),
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + size - 32)));
} else {
static constexpr size_t KB = 1024;
if (size >= 512 * KB && size <= 2048 * KB) {
// erms(enhanced repeat movsv/stosb) version works well in this region.
asm volatile("rep movsb" : "=D"(dst), "=S"(src), "=c"(size) : "0"(dst), "1"(src), "2"(size) : "memory");
} else {
size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;

if (padding > 0) {
__m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}

while (size >= 256) {
__m256i c0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
__m256i c1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32));
__m256i c2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 64));
__m256i c3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 96));
__m256i c4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 128));
__m256i c5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 160));
__m256i c6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 192));
__m256i c7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 224));
src += 256;

_mm256_store_si256((reinterpret_cast<__m256i*>(dst)), c0);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 32)), c1);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 64)), c2);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 96)), c3);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 128)), c4);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 160)), c5);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 192)), c6);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 224)), c7);
dst += 256;

size -= 256;
}

goto tail;
}
}
#else
std::memcpy(dst, src, size);
#endif
}
}
} // namespace strings
4 changes: 3 additions & 1 deletion be/src/io/cache_input_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <utility>

#include "block_cache/block_cache.h"
#include "gutil/strings/fastmem.h"
#include "util/hash_util.hpp"
#include "util/runtime_profile.h"
#include "util/stack_util.h"
Expand Down Expand Up @@ -84,7 +85,8 @@ StatusOr<int64_t> CacheInputStream::read(void* out, int64_t count) {
}

if (!can_zero_copy) {
memcpy(p, src + shift, size);
// memcpy(p, src + shift, size);
strings::memcpy_inlined(p, src + shift, size);
_stats.read_cache_bytes += size;
}
p += size;
Expand Down

0 comments on commit d940e8a

Please sign in to comment.