diff --git a/be/src/column/column_helper.h b/be/src/column/column_helper.h index 894640085bdbe..d5e45c57252ac 100644 --- a/be/src/column/column_helper.h +++ b/be/src/column/column_helper.h @@ -17,6 +17,7 @@ #include "column/type_traits.h" #include "gutil/bits.h" #include "gutil/casts.h" +#include "gutil/cpu.h" #include "runtime/primitive_type.h" #include "util/phmap/phmap.h" @@ -311,8 +312,8 @@ class ColumnHelper { using ColumnsConstIterator = Columns::const_iterator; static bool is_all_const(ColumnsConstIterator const& begin, ColumnsConstIterator const& end); static size_t compute_bytes_size(ColumnsConstIterator const& begin, ColumnsConstIterator const& end); - template - static size_t filter_range(const Column::Filter& filter, T* data, size_t from, size_t to) { + template + static size_t t_filter_range(const Column::Filter& filter, T* data, size_t from, size_t to) { auto start_offset = from; auto result_offset = from; @@ -337,9 +338,47 @@ class ColumnHelper { result_offset += kBatchNums; } else { - phmap::priv::BitMask bitmask(mask); - for (auto idx : bitmask) { - *(data + result_offset++) = *(data + start_offset + idx); +#define AVX512_COPY(SHIFT, MASK, WIDTH) \ + { \ + auto m = (mask >> SHIFT) & MASK; \ + if (m) { \ + __m512i dst; \ + __m512i src = _mm512_loadu_epi##WIDTH(data + start_offset + SHIFT); \ + dst = _mm512_mask_compress_epi##WIDTH(dst, m, src); \ + _mm512_storeu_epi##WIDTH(data + result_offset, dst); \ + result_offset += __builtin_popcount(m); \ + } \ + } + +// In theory we should put k1 in clobbers. +// But since we compile code with AVX2, k1 register is not used. +#define AVX512_ASM_COPY(SHIFT, MASK, WIDTH, WIDTHX) \ + { \ + auto m = (mask >> SHIFT) & MASK; \ + if (m) { \ + T* src = data + start_offset + SHIFT; \ + T* dst = data + result_offset; \ + __asm__ volatile("vmovdqu" #WIDTH \ + " (%[s]), %%zmm1\n" \ + "kmovw %[mask], %%k1\n" \ + "vpcompress" #WIDTHX \ + " %%zmm1, %%zmm0%{%%k1%}%{z%}\n" \ + "vmovdqu" #WIDTH " %%zmm0, (%[d])\n" \ + : [ s ] "+r"(src), [ d ] "+r"(dst) \ + : [ mask ] "r"(m) \ + : "zmm0", "zmm1", "memory"); \ + result_offset += __builtin_popcount(m); \ + } \ + } + + if constexpr (avx512f && sizeof(T) == 4) { + AVX512_ASM_COPY(0, 0xffff, 32, d); + AVX512_ASM_COPY(16, 0xffff, 32, d); + } else { + phmap::priv::BitMask bitmask(mask); + for (auto idx : bitmask) { + *(data + result_offset++) = *(data + start_offset + idx); + } } } @@ -386,6 +425,15 @@ class ColumnHelper { return result_offset; } + template + static size_t filter_range(const Column::Filter& filter, T* data, size_t from, size_t to) { + if (base::CPU::instance()->has_avx512f()) { + return t_filter_range(filter, data, from, to); + } else { + return t_filter_range(filter, data, from, to); + } + } + template static size_t filter(const Column::Filter& filter, T* data) { return filter_range(filter, data, 0, filter.size()); diff --git a/be/src/gutil/cpu.cc b/be/src/gutil/cpu.cc index 7b8855248a248..cb52d5ade87f3 100644 --- a/be/src/gutil/cpu.cc +++ b/be/src/gutil/cpu.cc @@ -214,6 +214,11 @@ void CPU::Initialize() { has_non_stop_time_stamp_counter_ = true; } } + // https://gcc.gnu.org/onlinedocs/gcc/x86-Built-in-Functions.html + __builtin_cpu_init(); + if (__builtin_cpu_supports("avx512f")) { + has_avx512f_ = true; + } #elif defined(ARCH_CPU_ARM_FAMILY) #if (defined(OS_ANDROID) || defined(OS_LINUX)) cpu_brand_ = *CpuInfoBrand(); @@ -235,4 +240,10 @@ CPU::IntelMicroArchitecture CPU::GetIntelMicroArchitecture() const { if (has_sse()) return SSE; return PENTIUM; } + +CPU _cpu_global_instance; +const CPU* CPU::instance() { + return &_cpu_global_instance; +} + } // namespace base diff --git a/be/src/gutil/cpu.h b/be/src/gutil/cpu.h index a002da1d526ad..28eb207b00476 100644 --- a/be/src/gutil/cpu.h +++ b/be/src/gutil/cpu.h @@ -83,10 +83,12 @@ class CPU final { bool has_avx() const { return has_avx_; } bool has_avx2() const { return has_avx2_; } bool has_aesni() const { return has_aesni_; } + bool has_avx512f() const { return has_avx512f_; } bool has_non_stop_time_stamp_counter() const { return has_non_stop_time_stamp_counter_; } bool is_running_in_vm() const { return is_running_in_vm_; } IntelMicroArchitecture GetIntelMicroArchitecture() const; const std::string& cpu_brand() const { return cpu_brand_; } + static const CPU* instance(); private: // Query the processor for CPUID information. @@ -109,9 +111,11 @@ class CPU final { bool has_avx_{false}; bool has_avx2_{false}; bool has_aesni_{false}; + bool has_avx512f_{false}; bool has_non_stop_time_stamp_counter_{false}; bool is_running_in_vm_{false}; std::string cpu_vendor_; std::string cpu_brand_; }; + } // namespace base diff --git a/be/test/exec/vectorized/json_parser_test.cpp b/be/test/exec/vectorized/json_parser_test.cpp index d0f9d9ea72238..90cc99a6b0168 100644 --- a/be/test/exec/vectorized/json_parser_test.cpp +++ b/be/test/exec/vectorized/json_parser_test.cpp @@ -448,7 +448,8 @@ PARALLEL_TEST(JsonParserTest, test_illegal_json_array) { PARALLEL_TEST(JsonParserTest, test_big_value) { simdjson::ondemand::parser simdjson_parser; // The padded_string would allocate memory with simdjson::SIMDJSON_PADDING bytes padding. - simdjson::padded_string input = simdjson::padded_string::load("./be/test/exec/test_data/json_scanner/big_value.json"); + simdjson::padded_string input = + simdjson::padded_string::load("./be/test/exec/test_data/json_scanner/big_value.json"); std::unique_ptr parser(new JsonDocumentStreamParser(&simdjson_parser)); diff --git a/be/test/formats/orc/orc_chunk_reader_test.cpp b/be/test/formats/orc/orc_chunk_reader_test.cpp index b7acf59ee7195..d1af9c6e025c0 100644 --- a/be/test/formats/orc/orc_chunk_reader_test.cpp +++ b/be/test/formats/orc/orc_chunk_reader_test.cpp @@ -1095,7 +1095,7 @@ TEST_F(OrcChunkReaderTest, TestReadArrayDecimal) { type_array.children.emplace_back(TypeDescriptor::create_decimalv3_type(TYPE_DECIMAL64, 9, 9)); SlotDesc slot_descs[] = { - {"id", TypeDescriptor::from_primtive_type(LogicalType::TYPE_INT)}, + {"id", TypeDescriptor::from_primtive_type(LogicalType::TYPE_INT)}, {"arr", type_array}, {""}, };