apacheGH-36641: [C++] Remove reference to acero from non-acero file (apache#36650)

### Rationale for this change

Files in modules which do not depend on the acero module should not reference files inside the acero module.

### What changes are included in this PR?

There were no changes to the body of any function.  I simply moved functions around so that the acero include was no longer needed.  Some conflicts arose between the class `bit_util` and the namespace `bit_util`, so I got rid of the class in favor of the namespace, which is more in line with how we handle `bit_util` elsewhere.
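
To illustrate the shape of the change, here is an abridged sketch (signatures taken from `cpp/src/arrow/compute/util.h` but heavily trimmed; `ARROW_EXPORT` is Arrow's symbol-export macro, and this is not the complete set of declarations):

```cpp
// Before: bit_util was a class exposing static member functions, exported as a
// whole, with the AVX2 kernels declared as private static members.
class ARROW_EXPORT bit_util {
 public:
  static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
                            const uint8_t* bits, uint8_t* bytes,
                            int bit_offset = 0);
  // ...
};

// After: bit_util is a namespace of free functions, each exported individually,
// and the AVX2 kernels move into a nested avx2 namespace.
namespace bit_util {

ARROW_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits,
                                const uint8_t* bits, uint8_t* bytes,
                                int bit_offset = 0);

#if defined(ARROW_HAVE_AVX2)
namespace avx2 {
ARROW_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
                                     uint8_t* bytes);
}  // namespace avx2
#endif

}  // namespace bit_util
```

Call sites are unaffected: `bit_util::bits_to_bytes(...)` resolves the same way whether `bit_util` is a class or a namespace.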

### Are these changes tested?

Sort of.  I would like to add an AVX2 CI system as well; I'm not confident any of the CI builds are building with AVX2 enabled.  Also, even if we had an AVX2 CI system it would not have caught this issue, since the code only needed definitions from the acero header and was not relying on any actual compiled symbols.  However, I think setting up tests to catch this sort of invalid include is beyond the scope of this PR.

### Are there any user-facing changes?

No.
* Closes: apache#36641

Lead-authored-by: Weston Pace <weston.pace@gmail.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
2 people authored and R-JunmingChen committed Aug 20, 2023
1 parent cc7c76b commit df63f97
Showing 3 changed files with 99 additions and 119 deletions.
70 changes: 35 additions & 35 deletions cpp/src/arrow/compute/util.cc
@@ -56,7 +56,9 @@ void TempVectorStack::release(int id, uint32_t num_bytes) {
--num_vectors_;
}

inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) {
namespace bit_util {

inline uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) {
// This will not be correct on big-endian architectures.
#if !ARROW_LITTLE_ENDIAN
ARROW_DCHECK(false);
@@ -73,7 +75,7 @@ inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes
}
}

inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) {
inline void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) {
// This will not be correct on big-endian architectures.
#if !ARROW_LITTLE_ENDIAN
ARROW_DCHECK(false);
@@ -88,8 +90,8 @@ inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_
}
}

inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
int* num_indexes, uint16_t* indexes) {
inline void bits_to_indexes_helper(uint64_t word, uint16_t base_index, int* num_indexes,
uint16_t* indexes) {
int n = *num_indexes;
while (word) {
indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
@@ -98,9 +100,8 @@ inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
*num_indexes = n;
}

inline void bit_util::bits_filter_indexes_helper(uint64_t word,
const uint16_t* input_indexes,
int* num_indexes, uint16_t* indexes) {
inline void bits_filter_indexes_helper(uint64_t word, const uint16_t* input_indexes,
int* num_indexes, uint16_t* indexes) {
int n = *num_indexes;
while (word) {
indexes[n++] = input_indexes[CountTrailingZeros(word)];
@@ -110,21 +111,21 @@ inline void bit_util::bits_filter_indexes_helper(uint64_t word,
}

template <int bit_to_search, bool filter_input_indexes>
void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
const uint8_t* bits,
const uint16_t* input_indexes, int* num_indexes,
uint16_t* indexes, uint16_t base_index) {
void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
const uint8_t* bits, const uint16_t* input_indexes,
int* num_indexes, uint16_t* indexes,
uint16_t base_index = 0) {
// 64 bits at a time
constexpr int unroll = 64;
int tail = num_bits % unroll;
#if defined(ARROW_HAVE_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
if (filter_input_indexes) {
bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
num_indexes, indexes);
avx2::bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
num_indexes, indexes);
} else {
bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes,
base_index);
avx2::bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes,
indexes, base_index);
}
} else {
#endif
@@ -160,9 +161,9 @@ void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bi
}
}

void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits,
const uint8_t* bits, int* num_indexes, uint16_t* indexes,
int bit_offset) {
void bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits,
const uint8_t* bits, int* num_indexes, uint16_t* indexes,
int bit_offset) {
bits += bit_offset / 8;
bit_offset %= 8;
*num_indexes = 0;
@@ -193,10 +194,9 @@ void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int nu
*num_indexes += num_indexes_new;
}

void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
const int num_bits, const uint8_t* bits,
const uint16_t* input_indexes, int* num_indexes,
uint16_t* indexes, int bit_offset) {
void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, const int num_bits,
const uint8_t* bits, const uint16_t* input_indexes,
int* num_indexes, uint16_t* indexes, int bit_offset) {
bits += bit_offset / 8;
bit_offset %= 8;
if (bit_offset != 0) {
@@ -226,19 +226,18 @@ void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
}
}

void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits,
const uint8_t* bits, int* num_indexes_bit0,
uint16_t* indexes_bit0, uint16_t* indexes_bit1,
int bit_offset) {
void bits_split_indexes(int64_t hardware_flags, const int num_bits, const uint8_t* bits,
int* num_indexes_bit0, uint16_t* indexes_bit0,
uint16_t* indexes_bit1, int bit_offset) {
bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
bit_offset);
int num_indexes_bit1;
bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1,
bit_offset);
}

void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits,
const uint8_t* bits, uint8_t* bytes, int bit_offset) {
void bits_to_bytes(int64_t hardware_flags, const int num_bits, const uint8_t* bits,
uint8_t* bytes, int bit_offset) {
bits += bit_offset / 8;
bit_offset %= 8;
if (bit_offset != 0) {
@@ -258,7 +257,7 @@ void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits,
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
// The function call below processes whole 32 bit chunks together.
num_processed = num_bits - (num_bits % 32);
bits_to_bytes_avx2(num_processed, bits, bytes);
avx2::bits_to_bytes_avx2(num_processed, bits, bytes);
}
#endif
// Processing 8 bits at a time
@@ -290,8 +289,8 @@ void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits,
}
}

void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits,
const uint8_t* bytes, uint8_t* bits, int bit_offset) {
void bytes_to_bits(int64_t hardware_flags, const int num_bits, const uint8_t* bytes,
uint8_t* bits, int bit_offset) {
bits += bit_offset / 8;
bit_offset %= 8;
if (bit_offset != 0) {
@@ -314,7 +313,7 @@ void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits,
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
// The function call below processes whole 32 bit chunks together.
num_processed = num_bits - (num_bits % 32);
bytes_to_bits_avx2(num_processed, bytes, bits);
avx2::bytes_to_bits_avx2(num_processed, bytes, bits);
}
#endif
// Process 8 bits at a time
@@ -338,11 +337,11 @@ void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits,
}
}

bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
uint32_t num_bytes) {
bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
uint32_t num_bytes) {
#if defined(ARROW_HAVE_AVX2)
if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
return are_all_bytes_zero_avx2(bytes, num_bytes);
return avx2::are_all_bytes_zero_avx2(bytes, num_bytes);
}
#endif
uint64_t result_or = 0;
@@ -358,6 +357,7 @@ bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
return result_or == 0;
}

} // namespace bit_util
} // namespace util

} // namespace arrow
86 changes: 36 additions & 50 deletions cpp/src/arrow/compute/util.h
@@ -139,69 +139,55 @@ class TempVectorHolder {
uint32_t num_elements_;
};

class ARROW_EXPORT bit_util {
public:
static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
const int num_bits, const uint8_t* bits, int* num_indexes,
uint16_t* indexes, int bit_offset = 0);
namespace bit_util {

static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
const int num_bits, const uint8_t* bits,
const uint16_t* input_indexes, int* num_indexes,
uint16_t* indexes, int bit_offset = 0);
int* num_indexes, uint16_t* indexes,
int bit_offset = 0);

// Input and output indexes may be pointing to the same data (in-place filtering).
static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
const uint8_t* bits, int* num_indexes_bit0,
uint16_t* indexes_bit0, uint16_t* indexes_bit1,
int bit_offset = 0);
ARROW_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
const int num_bits, const uint8_t* bits,
const uint16_t* input_indexes, int* num_indexes,
uint16_t* indexes, int bit_offset = 0);

// Bit 1 is replaced with byte 0xFF.
static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
// Input and output indexes may be pointing to the same data (in-place filtering).
ARROW_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits,
const uint8_t* bits, int* num_indexes_bit0,
uint16_t* indexes_bit0, uint16_t* indexes_bit1,
int bit_offset = 0);

// Return highest bit of each byte.
static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
// Bit 1 is replaced with byte 0xFF.
ARROW_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits,
const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);

static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
uint32_t num_bytes);
// Return highest bit of each byte.
ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits,
const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);

private:
inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes);
inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value);
inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
int* num_indexes, uint16_t* indexes);
inline static void bits_filter_indexes_helper(uint64_t word,
const uint16_t* input_indexes,
int* num_indexes, uint16_t* indexes);
template <int bit_to_search, bool filter_input_indexes>
static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
const uint8_t* bits, const uint16_t* input_indexes,
int* num_indexes, uint16_t* indexes,
uint16_t base_index = 0);
ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
uint32_t num_bytes);

#if defined(ARROW_HAVE_AVX2)
static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
const uint8_t* bits, int* num_indexes,
uint16_t* indexes, uint16_t base_index = 0);
static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
const uint8_t* bits, const uint16_t* input_indexes,
int* num_indexes, uint16_t* indexes);
template <int bit_to_search>
static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
int* num_indexes, uint16_t* indexes,
uint16_t base_index = 0);
template <int bit_to_search>
static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,

namespace avx2 {
ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
const uint8_t* bits,
const uint16_t* input_indexes,
int* num_indexes, uint16_t* indexes);
static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
ARROW_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
const uint8_t* bits, int* num_indexes,
uint16_t* indexes, uint16_t base_index = 0);
ARROW_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
uint8_t* bytes);
ARROW_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
uint8_t* bits);
ARROW_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
} // namespace avx2

#endif
};

} // namespace bit_util
} // namespace util

namespace compute {
62 changes: 28 additions & 34 deletions cpp/src/arrow/compute/util_avx2.cc
@@ -16,30 +16,18 @@
// under the License.

#include <immintrin.h>
#include <cstring>

#include "arrow/acero/util.h"
#include "arrow/util/bit_util.h"

namespace arrow {
namespace util {
#include "arrow/util/logging.h"

#if defined(ARROW_HAVE_AVX2)

void bit_util::bits_to_indexes_avx2(int bit_to_search, const int num_bits,
const uint8_t* bits, int* num_indexes,
uint16_t* indexes, uint16_t base_index) {
if (bit_to_search == 0) {
bits_to_indexes_imp_avx2<0>(num_bits, bits, num_indexes, indexes, base_index);
} else {
ARROW_DCHECK(bit_to_search == 1);
bits_to_indexes_imp_avx2<1>(num_bits, bits, num_indexes, indexes, base_index);
}
}
namespace arrow::util::avx2 {

template <int bit_to_search>
void bit_util::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
int* num_indexes, uint16_t* indexes,
uint16_t base_index) {
void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, int* num_indexes,
uint16_t* indexes, uint16_t base_index = 0) {
// 64 bits at a time
constexpr int unroll = 64;

@@ -82,21 +70,20 @@ void bit_util::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
}
}

void bit_util::bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
const uint8_t* bits,
const uint16_t* input_indexes, int* num_indexes,
uint16_t* indexes) {
void bits_to_indexes_avx2(int bit_to_search, const int num_bits, const uint8_t* bits,
int* num_indexes, uint16_t* indexes, uint16_t base_index) {
if (bit_to_search == 0) {
bits_filter_indexes_imp_avx2<0>(num_bits, bits, input_indexes, num_indexes, indexes);
bits_to_indexes_imp_avx2<0>(num_bits, bits, num_indexes, indexes, base_index);
} else {
bits_filter_indexes_imp_avx2<1>(num_bits, bits, input_indexes, num_indexes, indexes);
ARROW_DCHECK(bit_to_search == 1);
bits_to_indexes_imp_avx2<1>(num_bits, bits, num_indexes, indexes, base_index);
}
}

template <int bit_to_search>
void bit_util::bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
const uint16_t* input_indexes,
int* out_num_indexes, uint16_t* indexes) {
void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
const uint16_t* input_indexes, int* out_num_indexes,
uint16_t* indexes) {
// 64 bits at a time
constexpr int unroll = 64;

@@ -167,8 +154,17 @@ void bit_util::bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* b
*out_num_indexes = num_indexes;
}

void bit_util::bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
uint8_t* bytes) {
void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, const uint8_t* bits,
const uint16_t* input_indexes, int* num_indexes,
uint16_t* indexes) {
if (bit_to_search == 0) {
bits_filter_indexes_imp_avx2<0>(num_bits, bits, input_indexes, num_indexes, indexes);
} else {
bits_filter_indexes_imp_avx2<1>(num_bits, bits, input_indexes, num_indexes, indexes);
}
}

void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes) {
constexpr int unroll = 32;

constexpr uint64_t kEachByteIs1 = 0x0101010101010101ULL;
@@ -188,8 +184,7 @@ void bit_util::bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
}
}

void bit_util::bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
uint8_t* bits) {
void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits) {
constexpr int unroll = 32;
// Processing 32 bits at a time
for (int i = 0; i < num_bits / unroll; ++i) {
@@ -198,7 +193,7 @@ void bit_util::bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
}
}

bool bit_util::are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) {
bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) {
__m256i result_or = _mm256_setzero_si256();
uint32_t i;
for (i = 0; i < num_bytes / 32; ++i) {
@@ -216,7 +211,6 @@ bool bit_util::are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes)
return result_or32 == 0;
}

#endif // ARROW_HAVE_AVX2
} // namespace arrow::util::avx2

} // namespace util
} // namespace arrow
#endif // ARROW_HAVE_AVX2
