Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] Improve performance of strings::memcpy_inlined #13330

Merged
merged 4 commits into from Nov 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 6 additions & 2 deletions be/src/block_cache/fb_cachelib.cpp
Expand Up @@ -4,6 +4,7 @@

#include "common/logging.h"
#include "common/statusor.h"
#include "gutil/strings/fastmem.h"
#include "util/filesystem_util.h"

namespace starrocks {
Expand Down Expand Up @@ -60,7 +61,8 @@ Status FbCacheLib::write_cache(const std::string& key, const char* value, size_t
if (!handle) {
return Status::InternalError("allocate cachelib item failed");
}
std::memcpy(handle->getMemory(), value, size);
// std::memcpy(handle->getMemory(), value, size);
strings::memcpy_inlined(handle->getMemory(), value, size);
_cache->insertOrReplace(handle);
return Status::OK();
}
Expand All @@ -74,7 +76,9 @@ StatusOr<size_t> FbCacheLib::read_cache(const std::string& key, char* value, siz
return Status::NotFound("not found cachelib item");
}
DCHECK((off + size) <= handle->getSize());
std::memcpy(value, (char*)handle->getMemory() + off, size);
// std::memcpy(value, (char*)handle->getMemory() + off, size);
strings::memcpy_inlined(value, (char*)handle->getMemory() + off, size);

if (handle->hasChainedItem()) {
}
return size;
Expand Down
151 changes: 89 additions & 62 deletions be/src/gutil/strings/fastmem.h
Expand Up @@ -16,6 +16,9 @@

#pragma once

#include <emmintrin.h>
#include <immintrin.h>

#include <cstddef>
#include <cstdint>
#include <cstdio>
Expand Down Expand Up @@ -94,68 +97,92 @@ inline int fastmemcmp_inlined(const void* a_void, const void* b_void, size_t n)
return 0;
}

// The standard memcpy operation is slow for variable small sizes.
// This implementation inlines the optimal realization for sizes 1 to 16.
// To avoid code bloat don't use it in case of not performance-critical spots,
// nor when you don't expect very frequent values of size <= 16.
inline void memcpy_inlined(void* dst, const void* src, size_t size) {
// Compiler inlines code with minimal amount of data movement when third
// parameter of memcpy is a constant.
switch (size) {
case 0:
break;
case 1:
memcpy(dst, src, 1);
break;
case 2:
memcpy(dst, src, 2);
break;
case 3:
memcpy(dst, src, 3);
break;
case 4:
memcpy(dst, src, 4);
break;
case 5:
memcpy(dst, src, 5);
break;
case 6:
memcpy(dst, src, 6);
break;
case 7:
memcpy(dst, src, 7);
break;
case 8:
memcpy(dst, src, 8);
break;
case 9:
memcpy(dst, src, 9);
break;
case 10:
memcpy(dst, src, 10);
break;
case 11:
memcpy(dst, src, 11);
break;
case 12:
memcpy(dst, src, 12);
break;
case 13:
memcpy(dst, src, 13);
break;
case 14:
memcpy(dst, src, 14);
break;
case 15:
memcpy(dst, src, 15);
break;
case 16:
memcpy(dst, src, 16);
break;
default:
memcpy(dst, src, size);
break;
inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src, size_t size) {
auto dst = static_cast<uint8_t*>(_dst);
auto src = static_cast<const uint8_t*>(_src);

[[maybe_unused]] tail : if (size <= 16) {
if (size >= 8) {
__builtin_memcpy(dst + size - 8, src + size - 8, 8);
__builtin_memcpy(dst, src, 8);
} else if (size >= 4) {
__builtin_memcpy(dst + size - 4, src + size - 4, 4);
__builtin_memcpy(dst, src, 4);
} else if (size >= 2) {
__builtin_memcpy(dst + size - 2, src + size - 2, 2);
__builtin_memcpy(dst, src, 2);
} else if (size >= 1) {
*dst = *src;
}
}
}
else {
#ifdef __AVX2__
if (size <= 256) {
if (size <= 32) {
__builtin_memcpy(dst, src, 8);
__builtin_memcpy(dst + 8, src + 8, 8);
size -= 16;
dst += 16;
src += 16;
goto tail;
}

while (size > 32) {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst),
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)));
dst += 32;
src += 32;
size -= 32;
}

_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + size - 32),
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + size - 32)));
} else {
static constexpr size_t KB = 1024;
if (size >= 512 * KB && size <= 2048 * KB) {
// erms(enhanced repeat movsv/stosb) version works well in this region.
asm volatile("rep movsb" : "=D"(dst), "=S"(src), "=c"(size) : "0"(dst), "1"(src), "2"(size) : "memory");
} else {
size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;

if (padding > 0) {
__m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}

while (size >= 256) {
__m256i c0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
__m256i c1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32));
__m256i c2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 64));
__m256i c3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 96));
__m256i c4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 128));
__m256i c5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 160));
__m256i c6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 192));
__m256i c7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 224));
src += 256;

_mm256_store_si256((reinterpret_cast<__m256i*>(dst)), c0);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 32)), c1);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 64)), c2);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 96)), c3);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 128)), c4);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 160)), c5);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 192)), c6);
_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 224)), c7);
dst += 256;

size -= 256;
}

goto tail;
}
}
#else
std::memcpy(dst, src, size);
#endif
}
}
} // namespace strings
4 changes: 3 additions & 1 deletion be/src/io/cache_input_stream.cpp
Expand Up @@ -6,6 +6,7 @@
#include <utility>

#include "block_cache/block_cache.h"
#include "gutil/strings/fastmem.h"
#include "util/hash_util.hpp"
#include "util/runtime_profile.h"
#include "util/stack_util.h"
Expand Down Expand Up @@ -84,7 +85,8 @@ StatusOr<int64_t> CacheInputStream::read(void* out, int64_t count) {
}

if (!can_zero_copy) {
memcpy(p, src + shift, size);
// memcpy(p, src + shift, size);
strings::memcpy_inlined(p, src + shift, size);
_stats.read_cache_bytes += size;
}
p += size;
Expand Down