Skip to content

Commit

Permalink
Scan BidiRTL possibility via SIMD
Browse files Browse the repository at this point in the history
https://bugs.webkit.org/show_bug.cgi?id=274705
rdar://128719350

Reviewed by Sam Weinig.

Deploy optimization for BidiRTL possibility scanning via SIMD.
We clean up SIMDHelpers more to make code simpler

1. Add variadic bitOr / bitAnd / merge to simplify existing implementations.
2. Add all comparisons (lessThan, lessThanOrEqual, greaterThan, greaterThanOrEqual).
3. Make splat more easy-to-use form.
4. Add SIMD::stride helper inline variable to compute stride easily.

For existing implementations, I just deployed this new form. But in the furture, we would like
to extract common pattern and further simplifies the implementations. But for now, let's just
make BidiRTL scanning fast.

* Source/JavaScriptCore/runtime/JSONObject.cpp:
(JSC::FastStringifier<CharType>::append):
* Source/JavaScriptCore/runtime/LiteralParser.cpp:
(JSC::LiteralParser<CharType>::Lexer::lexString):
* Source/WTF/wtf/SIMDHelpers.h:
(WTF::SIMD::splat8):
(WTF::SIMD::splat16):
(WTF::SIMD::splat32):
(WTF::SIMD::splat64):
(WTF::SIMD::splat):
(WTF::SIMD::merge2):
(WTF::SIMD::bitOr2):
(WTF::SIMD::bitAnd2):
(WTF::SIMD::merge):
(WTF::SIMD::bitOr):
(WTF::SIMD::bitAnd):
(WTF::SIMD::bitNot):
(WTF::SIMD::lessThanOrEqual):
(WTF::SIMD::greaterThan):
(WTF::SIMD::greaterThanOrEqual):
* Source/WTF/wtf/text/StringCommon.h:
(WTF::findImpl):
(WTF::charactersContain):
* Source/WebCore/html/parser/HTMLDocumentParserFastPath.cpp:
(WebCore::HTMLFastPathParser::scanText):
(WebCore::HTMLFastPathParser::scanAttributeValue):
* Source/WebCore/layout/formattingContexts/inline/text/TextUtil.cpp:
(WebCore::Layout::TextUtil::containsStrongDirectionalityText):

Canonical link: https://commits.webkit.org/279356@main
  • Loading branch information
Constellation committed May 27, 2024
1 parent 5397a28 commit 3e50aea
Show file tree
Hide file tree
Showing 7 changed files with 286 additions and 53 deletions.
38 changes: 19 additions & 19 deletions Source/JavaScriptCore/runtime/JSONObject.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1063,13 +1063,13 @@ void FastStringifier<CharType>::append(JSValue value)

auto charactersCopySameType = [&](auto span, auto* cursor) ALWAYS_INLINE_LAMBDA {
#if (CPU(ARM64) || CPU(X86_64)) && COMPILER(CLANG)
constexpr size_t stride = 16 / sizeof(CharType);
constexpr size_t stride = SIMD::stride<CharType>;
if (span.size() >= stride) {
using UnsignedType = std::make_unsigned_t<CharType>;
using BulkType = decltype(SIMD::load(static_cast<const UnsignedType*>(nullptr)));
constexpr auto quoteMask = SIMD::splat(static_cast<UnsignedType>('"'));
constexpr auto escapeMask = SIMD::splat(static_cast<UnsignedType>('\\'));
constexpr auto controlMask = SIMD::splat(static_cast<UnsignedType>(' '));
constexpr auto quoteMask = SIMD::splat<UnsignedType>('"');
constexpr auto escapeMask = SIMD::splat<UnsignedType>('\\');
constexpr auto controlMask = SIMD::splat<UnsignedType>(' ');
const auto* ptr = span.data();
const auto* end = ptr + span.size();
auto* cursorEnd = cursor + span.size();
Expand All @@ -1080,11 +1080,11 @@ void FastStringifier<CharType>::append(JSValue value)
auto quotes = SIMD::equal(input, quoteMask);
auto escapes = SIMD::equal(input, escapeMask);
auto controls = SIMD::lessThan(input, controlMask);
accumulated = SIMD::merge(accumulated, SIMD::merge(quotes, SIMD::merge(escapes, controls)));
accumulated = SIMD::bitOr(accumulated, quotes, escapes, controls);
if constexpr (sizeof(CharType) != 1) {
constexpr auto surrogateMask = SIMD::splat(static_cast<UnsignedType>(0xf800));
constexpr auto surrogateCheckMask = SIMD::splat(static_cast<UnsignedType>(0xd800));
accumulated = SIMD::merge(accumulated, SIMD::equal(simde_vandq_u16(input, surrogateMask), surrogateCheckMask));
constexpr auto surrogateMask = SIMD::splat<UnsignedType>(0xf800);
constexpr auto surrogateCheckMask = SIMD::splat<UnsignedType>(0xd800);
accumulated = SIMD::bitOr(accumulated, SIMD::equal(SIMD::bitAnd(input, surrogateMask), surrogateCheckMask));
}
}
if (ptr < end) {
Expand All @@ -1093,11 +1093,11 @@ void FastStringifier<CharType>::append(JSValue value)
auto quotes = SIMD::equal(input, quoteMask);
auto escapes = SIMD::equal(input, escapeMask);
auto controls = SIMD::lessThan(input, controlMask);
accumulated = SIMD::merge(accumulated, SIMD::merge(quotes, SIMD::merge(escapes, controls)));
accumulated = SIMD::bitOr(accumulated, quotes, escapes, controls);
if constexpr (sizeof(CharType) != 1) {
constexpr auto surrogateMask = SIMD::splat(static_cast<UnsignedType>(0xf800));
constexpr auto surrogateCheckMask = SIMD::splat(static_cast<UnsignedType>(0xd800));
accumulated = SIMD::merge(accumulated, SIMD::equal(simde_vandq_u16(input, surrogateMask), surrogateCheckMask));
constexpr auto surrogateMask = SIMD::splat<UnsignedType>(0xf800);
constexpr auto surrogateCheckMask = SIMD::splat<UnsignedType>(0xd800);
accumulated = SIMD::bitOr(accumulated, SIMD::equal(SIMD::bitAnd(input, surrogateMask), surrogateCheckMask));
}
}
return SIMD::isNonZero(accumulated);
Expand All @@ -1117,14 +1117,14 @@ void FastStringifier<CharType>::append(JSValue value)

auto charactersCopyUpconvert = [&](std::span<const LChar> span, UChar* cursor) ALWAYS_INLINE_LAMBDA {
#if (CPU(ARM64) || CPU(X86_64)) && COMPILER(CLANG)
constexpr size_t stride = 16 / sizeof(LChar);
constexpr size_t stride = SIMD::stride<LChar>;
if (span.size() >= stride) {
using UnsignedType = std::make_unsigned_t<LChar>;
using BulkType = decltype(SIMD::load(static_cast<const UnsignedType*>(nullptr)));
constexpr auto quoteMask = SIMD::splat(static_cast<UnsignedType>('"'));
constexpr auto escapeMask = SIMD::splat(static_cast<UnsignedType>('\\'));
constexpr auto controlMask = SIMD::splat(static_cast<UnsignedType>(' '));
constexpr auto zeros = SIMD::splat(static_cast<UnsignedType>(0));
constexpr auto quoteMask = SIMD::splat<UnsignedType>('"');
constexpr auto escapeMask = SIMD::splat<UnsignedType>('\\');
constexpr auto controlMask = SIMD::splat<UnsignedType>(' ');
constexpr auto zeros = SIMD::splat<UnsignedType>(0);
const auto* ptr = span.data();
const auto* end = ptr + span.size();
auto* cursorEnd = cursor + span.size();
Expand All @@ -1135,15 +1135,15 @@ void FastStringifier<CharType>::append(JSValue value)
auto quotes = SIMD::equal(input, quoteMask);
auto escapes = SIMD::equal(input, escapeMask);
auto controls = SIMD::lessThan(input, controlMask);
accumulated = SIMD::merge(accumulated, SIMD::merge(quotes, SIMD::merge(escapes, controls)));
accumulated = SIMD::bitOr(accumulated, quotes, escapes, controls);
}
if (ptr < end) {
auto input = SIMD::load(bitwise_cast<const UnsignedType*>(end - stride));
simde_vst2q_u8(bitwise_cast<UnsignedType*>(cursorEnd - stride), (simde_uint8x16x2_t { input, zeros }));
auto quotes = SIMD::equal(input, quoteMask);
auto escapes = SIMD::equal(input, escapeMask);
auto controls = SIMD::lessThan(input, controlMask);
accumulated = SIMD::merge(accumulated, SIMD::merge(quotes, SIMD::merge(escapes, controls)));
accumulated = SIMD::bitOr(accumulated, quotes, escapes, controls);
}
return SIMD::isNonZero(accumulated);
}
Expand Down
10 changes: 5 additions & 5 deletions Source/JavaScriptCore/runtime/LiteralParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -869,18 +869,18 @@ ALWAYS_INLINE TokenType LiteralParser<CharType>::Lexer::lexString(LiteralParserT
} else {
([&]() ALWAYS_INLINE_LAMBDA {
#if CPU(ARM64) || CPU(X86_64)
constexpr size_t stride = 16 / sizeof(CharType);
constexpr size_t stride = SIMD::stride<CharType>;
using UnsignedType = std::make_unsigned_t<CharType>;
if (static_cast<size_t>(m_end - m_ptr) >= stride) {
constexpr auto quoteMask = SIMD::splat(static_cast<UnsignedType>('"'));
constexpr auto escapeMask = SIMD::splat(static_cast<UnsignedType>('\\'));
constexpr auto controlMask = SIMD::splat(static_cast<UnsignedType>(' '));
constexpr auto quoteMask = SIMD::splat<UnsignedType>('"');
constexpr auto escapeMask = SIMD::splat<UnsignedType>('\\');
constexpr auto controlMask = SIMD::splat<UnsignedType>(' ');
auto match = [&](auto* cursor) ALWAYS_INLINE_LAMBDA {
auto input = SIMD::load(bitwise_cast<const UnsignedType*>(cursor));
auto quotes = SIMD::equal(input, quoteMask);
auto escapes = SIMD::equal(input, escapeMask);
auto controls = SIMD::lessThan(input, controlMask);
auto mask = SIMD::merge(quotes, SIMD::merge(escapes, controls));
auto mask = SIMD::bitOr(quotes, escapes, controls);
return SIMD::findFirstNonZeroIndex(mask);
};

Expand Down
208 changes: 200 additions & 8 deletions Source/WTF/wtf/SIMDHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,26 +31,71 @@

namespace WTF::SIMD {

constexpr simde_uint8x16_t splat(uint8_t code)
template<typename LaneType>
struct LaneToVector;

template<>
struct LaneToVector<uint8_t> {
using Type = simde_uint8x16_t;
};

template<>
struct LaneToVector<uint16_t> {
using Type = simde_uint16x8_t;
};

template<>
struct LaneToVector<uint32_t> {
using Type = simde_uint32x4_t;
};

template<>
struct LaneToVector<uint64_t> {
using Type = simde_uint64x2_t;
};

template<typename LaneType>
using VectorType = typename LaneToVector<LaneType>::Type;


template<typename LaneType>
inline constexpr size_t stride = 16 / sizeof(LaneType);

constexpr simde_uint8x16_t splat8(uint8_t code)
{
return simde_uint8x16_t { code, code, code, code, code, code, code, code, code, code, code, code, code, code, code, code };
}

constexpr simde_uint16x8_t splat(uint16_t code)
constexpr simde_uint16x8_t splat16(uint16_t code)
{
return simde_uint16x8_t { code, code, code, code, code, code, code, code };
}

constexpr simde_uint32x4_t splat(uint32_t code)
constexpr simde_uint32x4_t splat32(uint32_t code)
{
return simde_uint32x4_t { code, code, code, code };
}

constexpr simde_uint64x2_t splat(uint64_t code)
constexpr simde_uint64x2_t splat64(uint64_t code)
{
return simde_uint64x2_t { code, code };
}

template<typename LaneType>
ALWAYS_INLINE constexpr decltype(auto) splat(LaneType lane)
{
if constexpr (sizeof(LaneType) == sizeof(uint8_t))
return splat8(static_cast<uint8_t>(lane));
else if constexpr (sizeof(LaneType) == sizeof(uint16_t))
return splat16(static_cast<uint16_t>(lane));
else if constexpr (sizeof(LaneType) == sizeof(uint32_t))
return splat32(static_cast<uint32_t>(lane));
else {
static_assert(sizeof(LaneType) == sizeof(uint64_t));
return splat64(static_cast<uint64_t>(lane));
}
}

ALWAYS_INLINE simde_uint8x16_t load(const uint8_t* ptr)
{
return simde_vld1q_u8(ptr);
Expand Down Expand Up @@ -91,26 +136,113 @@ ALWAYS_INLINE void store(simde_uint64x2_t value, uint64_t* ptr)
return simde_vst1q_u64(ptr, value);
}

ALWAYS_INLINE simde_uint8x16_t merge(simde_uint8x16_t accumulated, simde_uint8x16_t input)
ALWAYS_INLINE simde_uint8x16_t merge2(simde_uint8x16_t accumulated, simde_uint8x16_t input)
{
return simde_vorrq_u8(accumulated, input);
}

ALWAYS_INLINE simde_uint16x8_t merge2(simde_uint16x8_t accumulated, simde_uint16x8_t input)
{
return simde_vorrq_u16(accumulated, input);
}

ALWAYS_INLINE simde_uint32x4_t merge2(simde_uint32x4_t accumulated, simde_uint32x4_t input)
{
return simde_vorrq_u32(accumulated, input);
}

ALWAYS_INLINE simde_uint64x2_t merge2(simde_uint64x2_t accumulated, simde_uint64x2_t input)
{
return simde_vorrq_u64(accumulated, input);
}

ALWAYS_INLINE simde_uint8x16_t bitOr2(simde_uint8x16_t accumulated, simde_uint8x16_t input)
{
return simde_vorrq_u8(accumulated, input);
}

ALWAYS_INLINE simde_uint16x8_t merge(simde_uint16x8_t accumulated, simde_uint16x8_t input)
ALWAYS_INLINE simde_uint16x8_t bitOr2(simde_uint16x8_t accumulated, simde_uint16x8_t input)
{
return simde_vorrq_u16(accumulated, input);
}

ALWAYS_INLINE simde_uint32x4_t merge(simde_uint32x4_t accumulated, simde_uint32x4_t input)
ALWAYS_INLINE simde_uint32x4_t bitOr2(simde_uint32x4_t accumulated, simde_uint32x4_t input)
{
return simde_vorrq_u32(accumulated, input);
}

ALWAYS_INLINE simde_uint64x2_t merge(simde_uint64x2_t accumulated, simde_uint64x2_t input)
ALWAYS_INLINE simde_uint64x2_t bitOr2(simde_uint64x2_t accumulated, simde_uint64x2_t input)
{
return simde_vorrq_u64(accumulated, input);
}

ALWAYS_INLINE simde_uint8x16_t bitAnd2(simde_uint8x16_t accumulated, simde_uint8x16_t input)
{
return simde_vandq_u8(accumulated, input);
}

ALWAYS_INLINE simde_uint16x8_t bitAnd2(simde_uint16x8_t accumulated, simde_uint16x8_t input)
{
return simde_vandq_u16(accumulated, input);
}

ALWAYS_INLINE simde_uint32x4_t bitAnd2(simde_uint32x4_t accumulated, simde_uint32x4_t input)
{
return simde_vandq_u32(accumulated, input);
}

ALWAYS_INLINE simde_uint64x2_t bitAnd2(simde_uint64x2_t accumulated, simde_uint64x2_t input)
{
return simde_vandq_u64(accumulated, input);
}

template<typename VectorType, typename... Args>
ALWAYS_INLINE decltype(auto) merge(VectorType a0, VectorType a1, Args... args)
{
if constexpr (!sizeof...(args))
return merge2(a0, a1);
else
return merge2(a0, merge(a1, std::forward<Args>(args)...));
}

template<typename VectorType, typename... Args>
ALWAYS_INLINE decltype(auto) bitOr(VectorType a0, VectorType a1, Args... args)
{
if constexpr (!sizeof...(args))
return bitOr2(a0, a1);
else
return bitOr2(a0, bitOr(a1, std::forward<Args>(args)...));
}

template<typename VectorType, typename... Args>
ALWAYS_INLINE decltype(auto) bitAnd(VectorType a0, VectorType a1, Args... args)
{
if constexpr (!sizeof...(args))
return bitAnd2(a0, a1);
else
return bitAnd2(a0, bitAnd(a1, std::forward<Args>(args)...));
}

ALWAYS_INLINE simde_uint8x16_t bitNot(simde_uint8x16_t input)
{
return simde_vmvnq_u8(input);
}

ALWAYS_INLINE simde_uint16x8_t bitNot(simde_uint16x8_t input)
{
return simde_vmvnq_u16(input);
}

ALWAYS_INLINE simde_uint32x4_t bitNot(simde_uint32x4_t input)
{
return simde_vmvnq_u32(input);
}

ALWAYS_INLINE simde_uint64x2_t bitNot(simde_uint64x2_t input)
{
return simde_vreinterpretq_u64_u32(simde_vmvnq_u32(simde_vreinterpretq_u32_u64(input)));
}

ALWAYS_INLINE bool isNonZero(simde_uint8x16_t accumulated)
{
#if CPU(X86_64)
Expand Down Expand Up @@ -279,6 +411,66 @@ ALWAYS_INLINE simde_uint64x2_t lessThan(simde_uint64x2_t lhs, simde_uint64x2_t r
return simde_vcltq_u64(lhs, rhs);
}

ALWAYS_INLINE simde_uint8x16_t lessThanOrEqual(simde_uint8x16_t lhs, simde_uint8x16_t rhs)
{
return simde_vcleq_u8(lhs, rhs);
}

ALWAYS_INLINE simde_uint16x8_t lessThanOrEqual(simde_uint16x8_t lhs, simde_uint16x8_t rhs)
{
return simde_vcleq_u16(lhs, rhs);
}

ALWAYS_INLINE simde_uint32x4_t lessThanOrEqual(simde_uint32x4_t lhs, simde_uint32x4_t rhs)
{
return simde_vcleq_u32(lhs, rhs);
}

ALWAYS_INLINE simde_uint64x2_t lessThanOrEqual(simde_uint64x2_t lhs, simde_uint64x2_t rhs)
{
return simde_vcleq_u64(lhs, rhs);
}

ALWAYS_INLINE simde_uint8x16_t greaterThan(simde_uint8x16_t lhs, simde_uint8x16_t rhs)
{
return simde_vcgtq_u8(lhs, rhs);
}

ALWAYS_INLINE simde_uint16x8_t greaterThan(simde_uint16x8_t lhs, simde_uint16x8_t rhs)
{
return simde_vcgtq_u16(lhs, rhs);
}

ALWAYS_INLINE simde_uint32x4_t greaterThan(simde_uint32x4_t lhs, simde_uint32x4_t rhs)
{
return simde_vcgtq_u32(lhs, rhs);
}

ALWAYS_INLINE simde_uint64x2_t greaterThan(simde_uint64x2_t lhs, simde_uint64x2_t rhs)
{
return simde_vcgtq_u64(lhs, rhs);
}

ALWAYS_INLINE simde_uint8x16_t greaterThanOrEqual(simde_uint8x16_t lhs, simde_uint8x16_t rhs)
{
return simde_vcgeq_u8(lhs, rhs);
}

ALWAYS_INLINE simde_uint16x8_t greaterThanOrEqual(simde_uint16x8_t lhs, simde_uint16x8_t rhs)
{
return simde_vcgeq_u16(lhs, rhs);
}

ALWAYS_INLINE simde_uint32x4_t greaterThanOrEqual(simde_uint32x4_t lhs, simde_uint32x4_t rhs)
{
return simde_vcgeq_u32(lhs, rhs);
}

ALWAYS_INLINE simde_uint64x2_t greaterThanOrEqual(simde_uint64x2_t lhs, simde_uint64x2_t rhs)
{
return simde_vcgeq_u64(lhs, rhs);
}

}

namespace SIMD = WTF::SIMD;
Loading

0 comments on commit 3e50aea

Please sign in to comment.