diff --git a/be/src/block_cache/fb_cachelib.cpp b/be/src/block_cache/fb_cachelib.cpp index 66edbeb443d93..23ce8a98cb0d5 100644 --- a/be/src/block_cache/fb_cachelib.cpp +++ b/be/src/block_cache/fb_cachelib.cpp @@ -4,6 +4,7 @@ #include "common/logging.h" #include "common/statusor.h" +#include "gutil/strings/fastmem.h" #include "util/filesystem_util.h" namespace starrocks { @@ -60,7 +61,8 @@ Status FbCacheLib::write_cache(const std::string& key, const char* value, size_t if (!handle) { return Status::InternalError("allocate cachelib item failed"); } - std::memcpy(handle->getMemory(), value, size); + // std::memcpy(handle->getMemory(), value, size); + strings::memcpy_inlined(handle->getMemory(), value, size); _cache->insertOrReplace(handle); return Status::OK(); } @@ -74,7 +76,9 @@ StatusOr FbCacheLib::read_cache(const std::string& key, char* value, siz return Status::NotFound("not found cachelib item"); } DCHECK((off + size) <= handle->getSize()); - std::memcpy(value, (char*)handle->getMemory() + off, size); + // std::memcpy(value, (char*)handle->getMemory() + off, size); + strings::memcpy_inlined(value, (char*)handle->getMemory() + off, size); + if (handle->hasChainedItem()) { } return size; diff --git a/be/src/gutil/strings/fastmem.h b/be/src/gutil/strings/fastmem.h index 49aaea4b79fec..3050de3f6355c 100644 --- a/be/src/gutil/strings/fastmem.h +++ b/be/src/gutil/strings/fastmem.h @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include #include #include @@ -94,68 +97,92 @@ inline int fastmemcmp_inlined(const void* a_void, const void* b_void, size_t n) return 0; } -// The standard memcpy operation is slow for variable small sizes. -// This implementation inlines the optimal realization for sizes 1 to 16. -// To avoid code bloat don't use it in case of not performance-critical spots, -// nor when you don't expect very frequent values of size <= 16. -inline void memcpy_inlined(void* dst, const void* src, size_t size) { - // Compiler inlines code with minimal amount of data movement when third - // parameter of memcpy is a constant. - switch (size) { - case 0: - break; - case 1: - memcpy(dst, src, 1); - break; - case 2: - memcpy(dst, src, 2); - break; - case 3: - memcpy(dst, src, 3); - break; - case 4: - memcpy(dst, src, 4); - break; - case 5: - memcpy(dst, src, 5); - break; - case 6: - memcpy(dst, src, 6); - break; - case 7: - memcpy(dst, src, 7); - break; - case 8: - memcpy(dst, src, 8); - break; - case 9: - memcpy(dst, src, 9); - break; - case 10: - memcpy(dst, src, 10); - break; - case 11: - memcpy(dst, src, 11); - break; - case 12: - memcpy(dst, src, 12); - break; - case 13: - memcpy(dst, src, 13); - break; - case 14: - memcpy(dst, src, 14); - break; - case 15: - memcpy(dst, src, 15); - break; - case 16: - memcpy(dst, src, 16); - break; - default: - memcpy(dst, src, size); - break; +inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src, size_t size) { + auto dst = static_cast(_dst); + auto src = static_cast(_src); + + [[maybe_unused]] tail : if (size <= 16) { + if (size >= 8) { + __builtin_memcpy(dst + size - 8, src + size - 8, 8); + __builtin_memcpy(dst, src, 8); + } else if (size >= 4) { + __builtin_memcpy(dst + size - 4, src + size - 4, 4); + __builtin_memcpy(dst, src, 4); + } else if (size >= 2) { + __builtin_memcpy(dst + size - 2, src + size - 2, 2); + __builtin_memcpy(dst, src, 2); + } else if (size >= 1) { + *dst = *src; + } } -} + else { +#ifdef __AVX2__ + if (size <= 256) { + if (size <= 32) { + __builtin_memcpy(dst, src, 8); + __builtin_memcpy(dst + 8, src + 8, 8); + size -= 16; + dst += 16; + src += 16; + goto tail; + } + + while (size > 32) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), + _mm256_loadu_si256(reinterpret_cast(src))); + dst += 32; + src += 32; + size -= 32; + } + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + size - 32), + _mm256_loadu_si256(reinterpret_cast(src + size - 32))); + } else { + static constexpr size_t KB = 1024; + if (size >= 512 * KB && size <= 2048 * KB) { + // erms(enhanced repeat movsv/stosb) version works well in this region. + asm volatile("rep movsb" : "=D"(dst), "=S"(src), "=c"(size) : "0"(dst), "1"(src), "2"(size) : "memory"); + } else { + size_t padding = (32 - (reinterpret_cast(dst) & 31)) & 31; + + if (padding > 0) { + __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + while (size >= 256) { + __m256i c0 = _mm256_loadu_si256(reinterpret_cast(src)); + __m256i c1 = _mm256_loadu_si256(reinterpret_cast(src + 32)); + __m256i c2 = _mm256_loadu_si256(reinterpret_cast(src + 64)); + __m256i c3 = _mm256_loadu_si256(reinterpret_cast(src + 96)); + __m256i c4 = _mm256_loadu_si256(reinterpret_cast(src + 128)); + __m256i c5 = _mm256_loadu_si256(reinterpret_cast(src + 160)); + __m256i c6 = _mm256_loadu_si256(reinterpret_cast(src + 192)); + __m256i c7 = _mm256_loadu_si256(reinterpret_cast(src + 224)); + src += 256; + + _mm256_store_si256((reinterpret_cast<__m256i*>(dst)), c0); + _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 32)), c1); + _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 64)), c2); + _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 96)), c3); + _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 128)), c4); + _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 160)), c5); + _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 192)), c6); + _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 224)), c7); + dst += 256; + + size -= 256; + } + + goto tail; + } + } +#else + std::memcpy(dst, src, size); +#endif + } +} } // namespace strings diff --git a/be/src/io/cache_input_stream.cpp b/be/src/io/cache_input_stream.cpp index ff63b9f0e8c95..e33d0e2bec187 100644 --- a/be/src/io/cache_input_stream.cpp +++ b/be/src/io/cache_input_stream.cpp @@ -6,6 +6,7 @@ #include #include "block_cache/block_cache.h" +#include "gutil/strings/fastmem.h" #include "util/hash_util.hpp" #include "util/runtime_profile.h" #include "util/stack_util.h" @@ -84,7 +85,8 @@ StatusOr CacheInputStream::read(void* out, int64_t count) { } if (!can_zero_copy) { - memcpy(p, src + shift, size); + // memcpy(p, src + shift, size); + strings::memcpy_inlined(p, src + shift, size); _stats.read_cache_bytes += size; } p += size;