Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] optimize filter_range when avx512f available #14328

Merged
merged 5 commits into from Dec 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
58 changes: 53 additions & 5 deletions be/src/column/column_helper.h
Expand Up @@ -17,6 +17,7 @@
#include "column/type_traits.h"
#include "gutil/bits.h"
#include "gutil/casts.h"
#include "gutil/cpu.h"
#include "runtime/primitive_type.h"
#include "util/phmap/phmap.h"

Expand Down Expand Up @@ -311,8 +312,8 @@ class ColumnHelper {
using ColumnsConstIterator = Columns::const_iterator;
static bool is_all_const(ColumnsConstIterator const& begin, ColumnsConstIterator const& end);
static size_t compute_bytes_size(ColumnsConstIterator const& begin, ColumnsConstIterator const& end);
template <typename T>
static size_t filter_range(const Column::Filter& filter, T* data, size_t from, size_t to) {
template <typename T, bool avx512f>
static size_t t_filter_range(const Column::Filter& filter, T* data, size_t from, size_t to) {
auto start_offset = from;
auto result_offset = from;

Expand All @@ -337,9 +338,47 @@ class ColumnHelper {
result_offset += kBatchNums;

} else {
phmap::priv::BitMask<uint32_t, 32> bitmask(mask);
for (auto idx : bitmask) {
*(data + result_offset++) = *(data + start_offset + idx);
#define AVX512_COPY(SHIFT, MASK, WIDTH) \
{ \
auto m = (mask >> SHIFT) & MASK; \
if (m) { \
__m512i dst; \
__m512i src = _mm512_loadu_epi##WIDTH(data + start_offset + SHIFT); \
dst = _mm512_mask_compress_epi##WIDTH(dst, m, src); \
_mm512_storeu_epi##WIDTH(data + result_offset, dst); \
result_offset += __builtin_popcount(m); \
} \
}

// In theory we should put k1 in clobbers.
// But since we compile code with AVX2, k1 register is not used.
#define AVX512_ASM_COPY(SHIFT, MASK, WIDTH, WIDTHX) \
{ \
auto m = (mask >> SHIFT) & MASK; \
if (m) { \
T* src = data + start_offset + SHIFT; \
T* dst = data + result_offset; \
__asm__ volatile("vmovdqu" #WIDTH \
" (%[s]), %%zmm1\n" \
"kmovw %[mask], %%k1\n" \
"vpcompress" #WIDTHX \
" %%zmm1, %%zmm0%{%%k1%}%{z%}\n" \
"vmovdqu" #WIDTH " %%zmm0, (%[d])\n" \
: [ s ] "+r"(src), [ d ] "+r"(dst) \
: [ mask ] "r"(m) \
: "zmm0", "zmm1", "memory"); \
result_offset += __builtin_popcount(m); \
} \
}

if constexpr (avx512f && sizeof(T) == 4) {
AVX512_ASM_COPY(0, 0xffff, 32, d);
AVX512_ASM_COPY(16, 0xffff, 32, d);
} else {
phmap::priv::BitMask<uint32_t, 32> bitmask(mask);
for (auto idx : bitmask) {
*(data + result_offset++) = *(data + start_offset + idx);
}
}
}

Expand Down Expand Up @@ -386,6 +425,15 @@ class ColumnHelper {
return result_offset;
}

template <typename T>
static size_t filter_range(const Column::Filter& filter, T* data, size_t from, size_t to) {
if (base::CPU::instance()->has_avx512f()) {
return t_filter_range<T, true>(filter, data, from, to);
} else {
return t_filter_range<T, false>(filter, data, from, to);
}
}

template <typename T>
static size_t filter(const Column::Filter& filter, T* data) {
return filter_range(filter, data, 0, filter.size());
Expand Down
11 changes: 11 additions & 0 deletions be/src/gutil/cpu.cc
Expand Up @@ -214,6 +214,11 @@ void CPU::Initialize() {
has_non_stop_time_stamp_counter_ = true;
}
}
// https://gcc.gnu.org/onlinedocs/gcc/x86-Built-in-Functions.html
__builtin_cpu_init();
if (__builtin_cpu_supports("avx512f")) {
has_avx512f_ = true;
}
#elif defined(ARCH_CPU_ARM_FAMILY)
#if (defined(OS_ANDROID) || defined(OS_LINUX))
cpu_brand_ = *CpuInfoBrand();
Expand All @@ -235,4 +240,10 @@ CPU::IntelMicroArchitecture CPU::GetIntelMicroArchitecture() const {
if (has_sse()) return SSE;
return PENTIUM;
}

CPU _cpu_global_instance;
const CPU* CPU::instance() {
return &_cpu_global_instance;
}

} // namespace base
4 changes: 4 additions & 0 deletions be/src/gutil/cpu.h
Expand Up @@ -83,10 +83,12 @@ class CPU final {
bool has_avx() const { return has_avx_; }
bool has_avx2() const { return has_avx2_; }
bool has_aesni() const { return has_aesni_; }
bool has_avx512f() const { return has_avx512f_; }
bool has_non_stop_time_stamp_counter() const { return has_non_stop_time_stamp_counter_; }
bool is_running_in_vm() const { return is_running_in_vm_; }
IntelMicroArchitecture GetIntelMicroArchitecture() const;
const std::string& cpu_brand() const { return cpu_brand_; }
static const CPU* instance();

private:
// Query the processor for CPUID information.
Expand All @@ -109,9 +111,11 @@ class CPU final {
bool has_avx_{false};
bool has_avx2_{false};
bool has_aesni_{false};
bool has_avx512f_{false};
bool has_non_stop_time_stamp_counter_{false};
bool is_running_in_vm_{false};
std::string cpu_vendor_;
std::string cpu_brand_;
};

} // namespace base
3 changes: 2 additions & 1 deletion be/test/exec/vectorized/json_parser_test.cpp
Expand Up @@ -448,7 +448,8 @@ PARALLEL_TEST(JsonParserTest, test_illegal_json_array) {
PARALLEL_TEST(JsonParserTest, test_big_value) {
simdjson::ondemand::parser simdjson_parser;
// The padded_string would allocate memory with simdjson::SIMDJSON_PADDING bytes padding.
simdjson::padded_string input = simdjson::padded_string::load("./be/test/exec/test_data/json_scanner/big_value.json");
simdjson::padded_string input =
simdjson::padded_string::load("./be/test/exec/test_data/json_scanner/big_value.json");

std::unique_ptr<JsonParser> parser(new JsonDocumentStreamParser(&simdjson_parser));

Expand Down
2 changes: 1 addition & 1 deletion be/test/formats/orc/orc_chunk_reader_test.cpp
Expand Up @@ -1095,7 +1095,7 @@ TEST_F(OrcChunkReaderTest, TestReadArrayDecimal) {
type_array.children.emplace_back(TypeDescriptor::create_decimalv3_type(TYPE_DECIMAL64, 9, 9));

SlotDesc slot_descs[] = {
{"id", TypeDescriptor::from_primtive_type(LogicalType::TYPE_INT)},
{"id", TypeDescriptor::from_primtive_type(LogicalType::TYPE_INT)},
{"arr", type_array},
{""},
};
Expand Down