[Enhancement] Optimize a subtle inline performance problem #23300

Merged 2 commits on May 12, 2023
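This change adds an ALWAYS_NOINLINE attribute macro to be/src/common/compiler_util.h and applies it to the chunk-level compute_agg_* helpers in be/src/exec/aggregate/agg_hash_map.h (compute_agg_states_non_nullable, compute_agg_states_nullable, compute_agg_prefetch, compute_agg_noprefetch). It also moves the per-row null-key loop of the nullable paths into a new compute_agg_through_null_data helper. The intent, per the title, is to keep these large templated bodies from being inlined into their callers so that the hot dispatch code stays compact.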
1 change: 1 addition & 0 deletions be/src/common/compiler_util.h
@@ -59,6 +59,7 @@
/// needs to be inlined for a specific reason or the compiler's heuristics make a bad
/// decision, e.g. not inlining a small function on a hot path.
#define ALWAYS_INLINE __attribute__((always_inline))
#define ALWAYS_NOINLINE __attribute__((noinline))

#define ALIGN_CACHE_LINE __attribute__((aligned(CACHE_LINE_SIZE)))

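ALWAYS_NOINLINE mirrors the existing ALWAYS_INLINE helper in the opposite direction: always_inline forces a small hot function into its caller, while noinline keeps a large or rarely-taken body out of it. Below is a minimal sketch of that pairing; the function names are illustrative only and not part of this patch.

#include <cstdint>
#include <vector>

#define ALWAYS_INLINE __attribute__((always_inline))
#define ALWAYS_NOINLINE __attribute__((noinline))

// Rarely-taken slow path: forced out of line so it does not bloat every caller.
ALWAYS_NOINLINE void append_slow(std::vector<uint64_t>& out, uint64_t v) {
    out.reserve(out.capacity() == 0 ? 16 : out.capacity() * 2);
    out.push_back(v);
}

// Hot fast path: small enough to force inline, because the slow path above
// compiles to a plain call instead of being merged into this body.
ALWAYS_INLINE inline void append(std::vector<uint64_t>& out, uint64_t v) {
    if (out.size() < out.capacity()) {
        out.push_back(v);
    } else {
        append_slow(out, v);
    }
}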
224 changes: 122 additions & 102 deletions be/src/exec/aggregate/agg_hash_map.h
@@ -171,9 +171,9 @@ struct AggHashMapWithOneNumberKeyWithNullable

// Non Nullable
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_states_non_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_states_non_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
DCHECK(!key_columns[0]->is_nullable());
auto column = down_cast<ColumnType*>(key_columns[0].get());

@@ -196,8 +196,9 @@

// Nullable
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_states_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool, Func&& allocate_func,
Buffer<AggDataPtr>* agg_states, std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_states_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
        // Assign the not_founds vector when not-found keys need to be computed.
if constexpr (compute_not_founds) {
DCHECK(not_founds);
@@ -215,7 +216,6 @@ struct AggHashMapWithOneNumberKeyWithNullable
DCHECK(key_columns[0]->is_nullable());
auto* nullable_column = down_cast<NullableColumn*>(key_columns[0].get());
auto* data_column = down_cast<ColumnType*>(nullable_column->data_column().get());
const auto& null_data = nullable_column->null_column_data();

// Shortcut: if nullable column has no nulls.
if (!nullable_column->has_null()) {
@@ -226,31 +226,17 @@
this->template compute_agg_prefetch<Func, allocate_and_compute_state, compute_not_founds>(
data_column, agg_states, std::forward<Func>(allocate_func), not_founds);
}
return;
}

for (size_t i = 0; i < chunk_size; i++) {
if (null_data[i]) {
if (UNLIKELY(null_key_data == nullptr)) {
null_key_data = allocate_func(nullptr);
}
(*agg_states)[i] = null_key_data;
} else {
if constexpr (allocate_and_compute_state) {
this->template _handle_data_key_column<Func, compute_not_founds>(
data_column, i, std::forward<Func>(allocate_func), agg_states, not_founds);
} else if constexpr (compute_not_founds) {
_handle_data_key_column_without_allocate(data_column, i, agg_states, not_founds);
}
}
} else {
this->template compute_agg_through_null_data<Func, allocate_and_compute_state, compute_not_founds>(
chunk_size, nullable_column, agg_states, std::forward<Func>(allocate_func), not_founds);
}
}
}

    // The prefetch branch performs better with larger hash tables.
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_prefetch(ColumnType* column, Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_prefetch(ColumnType* column, Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
AGG_HASH_MAP_PRECOMPUTE_HASH_VALUES(column, AGG_HASH_MAP_DEFAULT_PREFETCH_DIST);
for (size_t i = 0; i < column_size; i++) {
AGG_HASH_MAP_PREFETCH_HASH_VALUE();
@@ -280,8 +266,8 @@ struct AggHashMapWithOneNumberKeyWithNullable

    // The non-prefetch branch performs better with small hash tables.
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_noprefetch(ColumnType* column, Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_noprefetch(ColumnType* column, Buffer<AggDataPtr>* agg_states,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
size_t num_rows = column->size();
for (size_t i = 0; i < num_rows; i++) {
FieldType key = column->get_data()[i];
@@ -306,6 +292,29 @@
}
}

template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
ALWAYS_NOINLINE void compute_agg_through_null_data(size_t chunk_size, NullableColumn* nullable_column,
Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
auto* data_column = down_cast<ColumnType*>(nullable_column->data_column().get());
const auto& null_data = nullable_column->null_column_data();
for (size_t i = 0; i < chunk_size; i++) {
if (null_data[i]) {
if (UNLIKELY(null_key_data == nullptr)) {
null_key_data = allocate_func(nullptr);
}
(*agg_states)[i] = null_key_data;
} else {
if constexpr (allocate_and_compute_state) {
this->template _handle_data_key_column<Func, compute_not_founds>(
data_column, i, std::forward<Func>(allocate_func), agg_states, not_founds);
} else if constexpr (compute_not_founds) {
_handle_data_key_column_without_allocate(data_column, i, agg_states, not_founds);
}
}
}
}
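With the loop hoisted into compute_agg_through_null_data and the helper marked ALWAYS_NOINLINE, compute_agg_states_nullable is reduced to the has_null() branch plus two calls, so chunks without nulls never carry the code of the per-row null handling. A rough sketch of that shape, using simplified hypothetical types rather than the real Column/AggDataPtr API:

#include <cstddef>
#include <cstdint>
#include <vector>

#define ALWAYS_NOINLINE __attribute__((noinline))

struct NullableAggSketch {
    // Cold path, kept out of line: walks rows one by one and checks the null
    // flag, standing in for compute_agg_through_null_data.
    ALWAYS_NOINLINE void compute_through_null_data(const std::vector<uint64_t>& keys,
                                                   const std::vector<uint8_t>& null_flags) {
        for (size_t i = 0; i < keys.size(); ++i) {
            if (null_flags[i]) {
                ++null_rows;        // stands in for routing the row to null_key_data
            } else {
                sum += keys[i];     // stands in for the per-row hash-map probe
            }
        }
    }

    // Hot dispatcher, standing in for compute_agg_states_nullable: the common
    // no-null chunk takes a tight loop with no per-row null checks.
    void compute(const std::vector<uint64_t>& keys, const std::vector<uint8_t>& null_flags,
                 bool has_null) {
        if (!has_null) {
            for (uint64_t k : keys) {
                sum += k;
            }
        } else {
            compute_through_null_data(keys, null_flags);
        }
    }

    uint64_t sum = 0;
    size_t null_rows = 0;
};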

template <typename Func, bool compute_not_founds>
void _handle_data_key_column(ColumnType* data_column, size_t row, Func&& allocate_func,
Buffer<AggDataPtr>* agg_states, std::vector<uint8_t>* not_founds) {
@@ -381,9 +390,9 @@ struct AggHashMapWithOneStringKeyWithNullable

// Non Nullable
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_states_non_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_states_non_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
DCHECK(key_columns[0]->is_binary());
auto column = down_cast<BinaryColumn*>(key_columns[0].get());

@@ -404,8 +413,9 @@

// Nullable
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_states_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool, Func&& allocate_func,
Buffer<AggDataPtr>* agg_states, std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_states_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
        // Assign the not_founds vector when not-found keys need to be computed.
if constexpr (compute_not_founds) {
DCHECK(not_founds);
@@ -423,7 +433,6 @@ struct AggHashMapWithOneStringKeyWithNullable
DCHECK(key_columns[0]->is_nullable());
auto* nullable_column = down_cast<NullableColumn*>(key_columns[0].get());
auto* data_column = down_cast<BinaryColumn*>(nullable_column->data_column().get());
const auto& null_data = nullable_column->null_column_data();
DCHECK(data_column->is_binary());

if (!nullable_column->has_null()) {
@@ -434,74 +443,16 @@
this->template compute_agg_prefetch<Func, allocate_and_compute_state, compute_not_founds>(
data_column, agg_states, pool, std::forward<Func>(allocate_func), not_founds);
}
return;
}

for (size_t i = 0; i < chunk_size; i++) {
if (null_data[i]) {
if (UNLIKELY(null_key_data == nullptr)) {
null_key_data = allocate_func(nullptr);
}
(*agg_states)[i] = null_key_data;
} else {
if constexpr (allocate_and_compute_state) {
this->template _handle_data_key_column<Func, compute_not_founds>(
data_column, i, pool, std::forward<Func>(allocate_func), agg_states, not_founds);
} else if constexpr (compute_not_founds) {
DCHECK(not_founds);
_handle_data_key_column_without_allocate(data_column, i, agg_states, not_founds);
}
}
            } else {
                this->template compute_agg_through_null_data<Func, allocate_and_compute_state, compute_not_founds>(
                        chunk_size, nullable_column, agg_states, pool, std::forward<Func>(allocate_func), not_founds);
            }
}
}

template <typename Func, bool compute_not_founds>
void _handle_data_key_column(BinaryColumn* data_column, size_t row, MemPool* pool, Func&& allocate_func,
Buffer<AggDataPtr>* agg_states, std::vector<uint8_t>* not_founds) {
auto key = data_column->get_slice(row);
auto iter = this->hash_map.lazy_emplace(key, [&](const auto& ctor) {
if constexpr (compute_not_founds) {
(*not_founds)[row] = 1;
            }
uint8_t* pos = pool->allocate(key.size);
strings::memcpy_inlined(pos, key.data, key.size);
Slice pk{pos, key.size};
AggDataPtr pv = allocate_func(pk);
ctor(pk, pv);
});
(*agg_states)[row] = iter->second;
}

void _handle_data_key_column_without_allocate(BinaryColumn* data_column, size_t row, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
auto key = data_column->get_slice(row);
if (auto iter = this->hash_map.find(key); iter != this->hash_map.end()) {
(*agg_states)[row] = iter->second;
} else {
(*not_founds)[row] = 1;
}
}

void insert_keys_to_columns(ResultVector& keys, const Columns& key_columns, size_t chunk_size) {
if constexpr (is_nullable) {
DCHECK(key_columns[0]->is_nullable());
auto* nullable_column = down_cast<NullableColumn*>(key_columns[0].get());
auto* column = down_cast<BinaryColumn*>(nullable_column->mutable_data_column());
keys.resize(chunk_size);
column->append_strings(keys);
nullable_column->null_column_data().resize(chunk_size);
} else {
DCHECK(!null_key_data);
auto* column = down_cast<BinaryColumn*>(key_columns[0].get());
keys.resize(chunk_size);
column->append_strings(keys);
}
}

template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_prefetch(BinaryColumn* column, Buffer<AggDataPtr>* agg_states, MemPool* pool, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_prefetch(BinaryColumn* column, Buffer<AggDataPtr>* agg_states, MemPool* pool,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
AGG_HASH_MAP_PRECOMPUTE_HASH_VALUES(column, AGG_HASH_MAP_DEFAULT_PREFETCH_DIST);
for (size_t i = 0; i < column_size; i++) {
AGG_HASH_MAP_PREFETCH_HASH_VALUE();
@@ -531,8 +482,8 @@ struct AggHashMapWithOneStringKeyWithNullable
}

template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_noprefetch(BinaryColumn* column, Buffer<AggDataPtr>* agg_states, MemPool* pool,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_noprefetch(BinaryColumn* column, Buffer<AggDataPtr>* agg_states, MemPool* pool,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
size_t num_rows = column->size();
for (size_t i = 0; i < num_rows; i++) {
auto key = column->get_slice(i);
Expand Down Expand Up @@ -560,6 +511,73 @@ struct AggHashMapWithOneStringKeyWithNullable
}
}

template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
ALWAYS_NOINLINE void compute_agg_through_null_data(size_t chunk_size, NullableColumn* nullable_column,
Buffer<AggDataPtr>* agg_states, MemPool* pool,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
auto* data_column = down_cast<BinaryColumn*>(nullable_column->data_column().get());
const auto& null_data = nullable_column->null_column_data();
for (size_t i = 0; i < chunk_size; i++) {
if (null_data[i]) {
if (UNLIKELY(null_key_data == nullptr)) {
null_key_data = allocate_func(nullptr);
}
(*agg_states)[i] = null_key_data;
} else {
if constexpr (allocate_and_compute_state) {
this->template _handle_data_key_column<Func, compute_not_founds>(
data_column, i, pool, std::forward<Func>(allocate_func), agg_states, not_founds);
} else if constexpr (compute_not_founds) {
DCHECK(not_founds);
_handle_data_key_column_without_allocate(data_column, i, agg_states, not_founds);
}
}
}
}

template <typename Func, bool compute_not_founds>
void _handle_data_key_column(BinaryColumn* data_column, size_t row, MemPool* pool, Func&& allocate_func,
Buffer<AggDataPtr>* agg_states, std::vector<uint8_t>* not_founds) {
auto key = data_column->get_slice(row);
auto iter = this->hash_map.lazy_emplace(key, [&](const auto& ctor) {
if constexpr (compute_not_founds) {
(*not_founds)[row] = 1;
}
uint8_t* pos = pool->allocate(key.size);
strings::memcpy_inlined(pos, key.data, key.size);
Slice pk{pos, key.size};
AggDataPtr pv = allocate_func(pk);
ctor(pk, pv);
});
(*agg_states)[row] = iter->second;
}

void _handle_data_key_column_without_allocate(BinaryColumn* data_column, size_t row, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
auto key = data_column->get_slice(row);
if (auto iter = this->hash_map.find(key); iter != this->hash_map.end()) {
(*agg_states)[row] = iter->second;
} else {
(*not_founds)[row] = 1;
}
}

void insert_keys_to_columns(ResultVector& keys, const Columns& key_columns, size_t chunk_size) {
if constexpr (is_nullable) {
DCHECK(key_columns[0]->is_nullable());
auto* nullable_column = down_cast<NullableColumn*>(key_columns[0].get());
auto* column = down_cast<BinaryColumn*>(nullable_column->mutable_data_column());
keys.resize(chunk_size);
column->append_strings(keys);
nullable_column->null_column_data().resize(chunk_size);
} else {
DCHECK(!null_key_data);
auto* column = down_cast<BinaryColumn*>(key_columns[0].get());
keys.resize(chunk_size);
column->append_strings(keys);
}
}

static constexpr bool has_single_null_key = is_nullable;

AggDataPtr null_key_data = nullptr;
@@ -712,8 +730,9 @@ struct AggHashMapWithSerializedKeyFixedSize
AggDataPtr get_null_key_data() { return nullptr; }

template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_prefetch(size_t chunk_size, const Columns& key_columns, Buffer<AggDataPtr>* agg_states,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_prefetch(size_t chunk_size, const Columns& key_columns,
Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
auto* buffer = reinterpret_cast<uint8_t*>(caches.data());
for (const auto& key_column : key_columns) {
key_column->serialize_batch(buffer, slice_sizes, chunk_size, max_fixed_size);
@@ -755,8 +774,9 @@
}

template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_noprefetch(size_t chunk_size, const Columns& key_columns, Buffer<AggDataPtr>* agg_states,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_noprefetch(size_t chunk_size, const Columns& key_columns,
Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
constexpr int key_size = sizeof(FixedSizeSliceKey);
auto* buffer = reinterpret_cast<uint8_t*>(caches.data());
for (const auto& key_column : key_columns) {