Skip to content

Commit

Permalink
[JSC] Use simde for find16,32,64 functions
Browse files Browse the repository at this point in the history
https://bugs.webkit.org/show_bug.cgi?id=273849
rdar://127701822

Reviewed by Mark Lam.

This patch starts using simde for find16 / find32 / find64 implementations.
And we also use a slightly different approach so that we no longer need access to the out-of-ASan region (while still staying within the page boundary).
So we can remove SUPPRESS_ASAN. And we also use the same code for x64 too.

* Source/WTF/wtf/text/StringCommon.cpp:
(WTF::find16AlignedImpl): Deleted.
(WTF::find32AlignedImpl): Deleted.
(WTF::find64AlignedImpl): Deleted.
* Source/WTF/wtf/text/StringCommon.h:

Canonical link: https://commits.webkit.org/278641@main
  • Loading branch information
Constellation committed May 10, 2024
1 parent a99b31f commit 91262be
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 203 deletions.
116 changes: 116 additions & 0 deletions Source/WTF/wtf/SIMDHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ constexpr simde_uint16x8_t splat(uint16_t code)
return simde_uint16x8_t { code, code, code, code, code, code, code, code };
}

// Broadcasts a single 32-bit value into every lane of a 128-bit vector.
constexpr simde_uint32x4_t splat(uint32_t code)
{
    simde_uint32x4_t broadcasted { code, code, code, code };
    return broadcasted;
}

// Broadcasts a single 64-bit value into both lanes of a 128-bit vector.
constexpr simde_uint64x2_t splat(uint64_t code)
{
    simde_uint64x2_t broadcasted { code, code };
    return broadcasted;
}

ALWAYS_INLINE simde_uint8x16_t load(const uint8_t* ptr)
{
return simde_vld1q_u8(ptr);
Expand All @@ -51,6 +61,16 @@ ALWAYS_INLINE simde_uint16x8_t load(const uint16_t* ptr)
return simde_vld1q_u16(ptr);
}

// Loads four consecutive uint32_t values into a 128-bit vector.
ALWAYS_INLINE simde_uint32x4_t load(const uint32_t* ptr)
{
    simde_uint32x4_t loaded = simde_vld1q_u32(ptr);
    return loaded;
}

// Loads two consecutive uint64_t values into a 128-bit vector.
ALWAYS_INLINE simde_uint64x2_t load(const uint64_t* ptr)
{
    simde_uint64x2_t loaded = simde_vld1q_u64(ptr);
    return loaded;
}

ALWAYS_INLINE void store(simde_uint8x16_t value, uint8_t* ptr)
{
return simde_vst1q_u8(ptr, value);
Expand All @@ -61,6 +81,16 @@ ALWAYS_INLINE void store(simde_uint16x8_t value, uint16_t* ptr)
return simde_vst1q_u16(ptr, value);
}

// Writes the four 32-bit lanes of `value` to consecutive uint32_t slots at `ptr`.
ALWAYS_INLINE void store(simde_uint32x4_t value, uint32_t* ptr)
{
    simde_vst1q_u32(ptr, value);
}

// Writes the two 64-bit lanes of `value` to consecutive uint64_t slots at `ptr`.
ALWAYS_INLINE void store(simde_uint64x2_t value, uint64_t* ptr)
{
    simde_vst1q_u64(ptr, value);
}

ALWAYS_INLINE simde_uint8x16_t merge(simde_uint8x16_t accumulated, simde_uint8x16_t input)
{
return simde_vorrq_u8(accumulated, input);
Expand All @@ -71,6 +101,16 @@ ALWAYS_INLINE simde_uint16x8_t merge(simde_uint16x8_t accumulated, simde_uint16x
return simde_vorrq_u16(accumulated, input);
}

// Folds a new comparison result into the running accumulator via lane-wise OR.
ALWAYS_INLINE simde_uint32x4_t merge(simde_uint32x4_t accumulated, simde_uint32x4_t input)
{
    simde_uint32x4_t combined = simde_vorrq_u32(accumulated, input);
    return combined;
}

// Folds a new comparison result into the running accumulator via lane-wise OR.
ALWAYS_INLINE simde_uint64x2_t merge(simde_uint64x2_t accumulated, simde_uint64x2_t input)
{
    simde_uint64x2_t combined = simde_vorrq_u64(accumulated, input);
    return combined;
}

ALWAYS_INLINE bool isNonZero(simde_uint8x16_t accumulated)
{
#if CPU(X86_64)
Expand All @@ -91,6 +131,16 @@ ALWAYS_INLINE bool isNonZero(simde_uint16x8_t accumulated)
#endif
}

// Returns true if any lane of `accumulated` is non-zero.
ALWAYS_INLINE bool isNonZero(simde_uint32x4_t accumulated)
{
#if CPU(X86_64)
auto raw = simde_uint32x4_to_m128i(accumulated);
// PTEST-style check: test_all_zeros is 1 only when every bit is zero, so negate it.
return !simde_mm_test_all_zeros(raw, raw);
#else
// The horizontal maximum across lanes is non-zero iff at least one lane is non-zero.
return simde_vmaxvq_u32(accumulated);
#endif
}

ALWAYS_INLINE std::optional<uint8_t> findFirstNonZeroIndex(simde_uint8x16_t value)
{
#if CPU(X86_64)
Expand All @@ -116,13 +166,59 @@ ALWAYS_INLINE std::optional<uint8_t> findFirstNonZeroIndex(simde_uint16x8_t valu
return std::nullopt;
return std::countr_zero(mask) >> 1;
#else
// Incoming value is a comparison result, where each vector element is either all 1s or 0s.
if (!isNonZero(value))
return std::nullopt;
constexpr simde_uint16x8_t indexMask { 0, 1, 2, 3, 4, 5, 6, 7 };
// Found elements are all-1 and the other elements are 0. But it is possible that this vector
// includes multiple found characters. We perform [0, 1, 2, 3, 4, 5, 6, 7] OR-NOT with this value,
// to assign the index to found characters.
// Find the smallest value. Because of [0, 1, 2, 3, 4, 5, 6, 7], the value should be index in this vector.
// If the index less than length, it is within the requested pointer. Otherwise, nullptr.
//
// Example
// value |0|0|0|X|0|X|0|0| (X is all-one)
// not-value |X|X|X|0|X|0|X|X|
// index-value |0|1|2|3|4|5|6|7|
// ranked |X|X|X|3|X|5|X|X|
// index 3, the smallest number from this vector, and it is the same to the index.
return simde_vminvq_u16(simde_vornq_u16(indexMask, value));
#endif
}

// Returns the index (0-3) of the first non-zero 32-bit lane of `value`, or std::nullopt if all lanes are zero.
// Incoming value is a comparison result, where each vector element is either all 1s or 0s.
ALWAYS_INLINE std::optional<uint8_t> findFirstNonZeroIndex(simde_uint32x4_t value)
{
#if CPU(X86_64)
auto raw = simde_uint32x4_to_m128i(value);
// movemask gathers the top bit of each byte, so each 32-bit lane contributes 4 mask bits.
uint16_t mask = simde_mm_movemask_epi8(raw);
if (!mask)
return std::nullopt;
// Divide the first set bit's position by 4 (bytes per lane) to recover the lane index.
return std::countr_zero(mask) >> 2;
#else
if (!isNonZero(value))
return std::nullopt;
constexpr simde_uint32x4_t indexMask { 0, 1, 2, 3 };
// OR-NOT gives each matching lane its own index while non-matching lanes become all-1s;
// the horizontal minimum is then the first matching index. See the uint16x8 overload above for a worked example.
return simde_vminvq_u32(simde_vornq_u32(indexMask, value));
#endif
}

// Returns the index (0-1) of the first non-zero 64-bit lane of `value`, or std::nullopt if both lanes are zero.
// Incoming value is a comparison result, where each vector element is either all 1s or 0s.
ALWAYS_INLINE std::optional<uint8_t> findFirstNonZeroIndex(simde_uint64x2_t value)
{
#if CPU(X86_64)
auto raw = simde_uint64x2_to_m128i(value);
// movemask gathers the top bit of each byte, so each 64-bit lane contributes 8 mask bits.
uint16_t mask = simde_mm_movemask_epi8(raw);
if (!mask)
return std::nullopt;
// Divide the first set bit's position by 8 (bytes per lane) to recover the lane index.
return std::countr_zero(mask) >> 3;
#else
// Narrow each 64-bit lane to 32 bits; lanes are all-1s or all-0s, so no match information is lost.
simde_uint32x2_t reducedMask = simde_vmovn_u64(value);
if (!simde_vget_lane_u64(simde_vreinterpret_u64_u32(reducedMask), 0))
return std::nullopt;
constexpr simde_uint32x2_t indexMask { 0, 1 }; // It is intentionally uint32x2_t.
// OR-NOT gives each matching lane its index; the horizontal minimum is the first match.
return simde_vminv_u32(simde_vorn_u32(indexMask, reducedMask));
#endif
}

template<LChar character, LChar... characters>
ALWAYS_INLINE simde_uint8x16_t equal(simde_uint8x16_t input)
{
Expand Down Expand Up @@ -153,6 +249,16 @@ ALWAYS_INLINE simde_uint16x8_t equal(simde_uint16x8_t lhs, simde_uint16x8_t rhs)
return simde_vceqq_u16(lhs, rhs);
}

// Lane-wise equality compare: a matching 32-bit lane becomes all 1s, otherwise 0.
ALWAYS_INLINE simde_uint32x4_t equal(simde_uint32x4_t lhs, simde_uint32x4_t rhs)
{
    simde_uint32x4_t comparison = simde_vceqq_u32(lhs, rhs);
    return comparison;
}

// Lane-wise equality compare: a matching 64-bit lane becomes all 1s, otherwise 0.
ALWAYS_INLINE simde_uint64x2_t equal(simde_uint64x2_t lhs, simde_uint64x2_t rhs)
{
    simde_uint64x2_t comparison = simde_vceqq_u64(lhs, rhs);
    return comparison;
}

ALWAYS_INLINE simde_uint8x16_t lessThan(simde_uint8x16_t lhs, simde_uint8x16_t rhs)
{
return simde_vcltq_u8(lhs, rhs);
Expand All @@ -163,6 +269,16 @@ ALWAYS_INLINE simde_uint16x8_t lessThan(simde_uint16x8_t lhs, simde_uint16x8_t r
return simde_vcltq_u16(lhs, rhs);
}

// Lane-wise unsigned less-than: a lane where lhs < rhs becomes all 1s, otherwise 0.
ALWAYS_INLINE simde_uint32x4_t lessThan(simde_uint32x4_t lhs, simde_uint32x4_t rhs)
{
    simde_uint32x4_t comparison = simde_vcltq_u32(lhs, rhs);
    return comparison;
}

// Lane-wise unsigned less-than: a lane where lhs < rhs becomes all 1s, otherwise 0.
ALWAYS_INLINE simde_uint64x2_t lessThan(simde_uint64x2_t lhs, simde_uint64x2_t rhs)
{
    simde_uint64x2_t comparison = simde_vcltq_u64(lhs, rhs);
    return comparison;
}

}

namespace SIMD = WTF::SIMD;
114 changes: 0 additions & 114 deletions Source/WTF/wtf/text/StringCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,120 +28,6 @@

namespace WTF {

// Scans `length` uint16_t elements starting at 16-byte-aligned `pointer` for `character`,
// returning a pointer to the first occurrence, or nullptr when it is absent.
// Suppress ASan because this code intentionally loads out-of-bound memory, but it must be safe since we do not overlap page boundary.
SUPPRESS_ASAN
const uint16_t* find16AlignedImpl(const uint16_t* pointer, uint16_t character, size_t length)
{
// Pointer must be at least 2-byte aligned for uint16_t access.
ASSERT(!(reinterpret_cast<uintptr_t>(pointer) & 0x1));

constexpr simde_uint16x8_t indexMask { 0, 1, 2, 3, 4, 5, 6, 7 };

// Our load is always aligned to 16byte. So long as at least one character exists in this range,
// access must succeed since it does not overlap with the page boundary.

ASSERT(length);
ASSERT(!(reinterpret_cast<uintptr_t>(pointer) & 0xf));
ASSERT((reinterpret_cast<uintptr_t>(pointer) & ~static_cast<uintptr_t>(0xf)) == reinterpret_cast<uintptr_t>(pointer));
const uint16_t* cursor = pointer;
// 8 uint16_t elements per 16-byte vector.
constexpr size_t stride = 16 / sizeof(uint16_t);

// Dupe character => |c|c|c|c|c|c|c|c|
simde_uint16x8_t charactersVector = simde_vdupq_n_u16(character);

while (true) {
// Load target value. It is possible that this includes unrelated part of the memory.
simde_uint16x8_t value = simde_vld1q_u16(cursor);
// If the character is the same, then it becomes all-1. Otherwise, it becomes 0.
simde_uint16x8_t mask = simde_vceqq_u16(value, charactersVector);

// value |c|c|c|C|c|C|c|c| (c is character, C is matching character)
// eq with charactersVector |0|0|0|X|0|X|0|0| (X is all-1)
// Reduce to uint8x8_t |0|0|0|X|0|X|0|0| => reinterpret it as uint64_t. If it is non-zero, matching character exists.
if (simde_vget_lane_u64(simde_vreinterpret_u64_u8(simde_vmovn_u16(mask)), 0)) {
// Found elements are all-1 and the other elements are 0. But it is possible that this vector
// includes multiple found characters. We perform [0, 1, 2, 3, 4, 5, 6, 7] OR-NOT with this mask,
// to assign the index to found characters.
simde_uint16x8_t ranked = simde_vornq_u16(indexMask, mask);
// Find the smallest value. Because of [0, 1, 2, 3, 4, 5, 6, 7], the value should be index in this vector.
uint16_t index = simde_vminvq_u16(ranked);
// If the index less than length, it is within the requested pointer. Otherwise, nullptr.
//
// Example
// mask |0|0|0|X|0|X|0|0| (X is all-one)
// not-mask |X|X|X|0|X|0|X|X|
// index-mask |0|1|2|3|4|5|6|7|
// ranked |X|X|X|3|X|5|X|X|
// index 3, the smallest number from this vector, and it is the same to the index.
return (index < length) ? cursor + index : nullptr;
}
// No match in this chunk; stop once the remaining elements were all covered by it.
if (length <= stride)
return nullptr;
length -= stride;
cursor += stride;
}
}

// Scans `length` uint32_t elements starting at 16-byte-aligned `pointer` for `character`,
// returning a pointer to the first occurrence, or nullptr when it is absent.
// See find16AlignedImpl above for a fully-commented walkthrough of the algorithm.
// SUPPRESS_ASAN because the final load may intentionally read past `length` (while staying within the 16-byte-aligned region).
SUPPRESS_ASAN
const uint32_t* find32AlignedImpl(const uint32_t* pointer, uint32_t character, size_t length)
{
// Pointer must be at least 4-byte aligned for uint32_t access.
ASSERT(!(reinterpret_cast<uintptr_t>(pointer) & 0b11));

constexpr simde_uint32x4_t indexMask { 0, 1, 2, 3 };

ASSERT(length);
ASSERT(!(reinterpret_cast<uintptr_t>(pointer) & 0xf));
ASSERT((reinterpret_cast<uintptr_t>(pointer) & ~static_cast<uintptr_t>(0xf)) == reinterpret_cast<uintptr_t>(pointer));
const uint32_t* cursor = pointer;
// 4 uint32_t elements per 16-byte vector.
constexpr size_t stride = 16 / sizeof(uint32_t);

// Broadcast the needle to every lane: |c|c|c|c|.
simde_uint32x4_t charactersVector = simde_vdupq_n_u32(character);

while (true) {
// The tail load may include memory beyond the requested range.
simde_uint32x4_t value = simde_vld1q_u32(cursor);
// Matching lanes become all-1s, the rest become 0.
simde_uint32x4_t mask = simde_vceqq_u32(value, charactersVector);
// Narrow each lane to 16 bits and test the resulting 64 bits for any set bit.
if (simde_vget_lane_u64(simde_vreinterpret_u64_u16(simde_vmovn_u32(mask)), 0)) {
// OR-NOT assigns each matching lane its index; non-matching lanes become all-1s.
simde_uint32x4_t ranked = simde_vornq_u32(indexMask, mask);
// The horizontal minimum is the first matching lane's index.
uint32_t index = simde_vminvq_u32(ranked);
// A match at index >= length came from the over-read tail; treat it as not found.
return (index < length) ? cursor + index : nullptr;
}
if (length <= stride)
return nullptr;
length -= stride;
cursor += stride;
}
}

// Scans `length` uint64_t elements starting at 16-byte-aligned `pointer` for `character`,
// returning a pointer to the first occurrence, or nullptr when it is absent.
// See find16AlignedImpl above for a fully-commented walkthrough of the algorithm.
// SUPPRESS_ASAN because the final load may intentionally read past `length` (while staying within the 16-byte-aligned region).
SUPPRESS_ASAN
const uint64_t* find64AlignedImpl(const uint64_t* pointer, uint64_t character, size_t length)
{
// Pointer must be at least 8-byte aligned for uint64_t access.
ASSERT(!(reinterpret_cast<uintptr_t>(pointer) & 0b111));

constexpr simde_uint32x2_t indexMask { 0, 1 };

ASSERT(length);
ASSERT(!(reinterpret_cast<uintptr_t>(pointer) & 0xf));
ASSERT((reinterpret_cast<uintptr_t>(pointer) & ~static_cast<uintptr_t>(0xf)) == reinterpret_cast<uintptr_t>(pointer));
const uint64_t* cursor = pointer;
// 2 uint64_t elements per 16-byte vector.
constexpr size_t stride = 16 / sizeof(uint64_t);

// Broadcast the needle to both lanes: |c|c|.
simde_uint64x2_t charactersVector = simde_vdupq_n_u64(character);

while (true) {
// The tail load may include memory beyond the requested range.
simde_uint64x2_t value = simde_vld1q_u64(cursor);
// Matching lanes become all-1s, the rest become 0.
simde_uint64x2_t mask = simde_vceqq_u64(value, charactersVector);
// Narrow each 64-bit lane to 32 bits; lanes are all-1s or all-0s, so no match information is lost.
simde_uint32x2_t reducedMask = simde_vmovn_u64(mask);
if (simde_vget_lane_u64(simde_vreinterpret_u64_u32(reducedMask), 0)) {
// OR-NOT assigns each matching lane its index; non-matching lanes become all-1s.
simde_uint32x2_t ranked = simde_vorn_u32(indexMask, reducedMask);
// The horizontal minimum is the first matching lane's index.
uint64_t index = simde_vminv_u32(ranked);
// A match at index >= length came from the over-read tail; treat it as not found.
return (index < length) ? cursor + index : nullptr;
}
if (length <= stride)
return nullptr;
length -= stride;
cursor += stride;
}
}

SUPPRESS_ASAN
const float* findFloatAlignedImpl(const float* pointer, float target, size_t length)
{
Expand Down
Loading

0 comments on commit 91262be

Please sign in to comment.