From e8401be1912bd7ccd878b2605d7edf2c91cf74f4 Mon Sep 17 00:00:00 2001 From: Yusuke Suzuki Date: Fri, 10 May 2024 22:15:05 -0700 Subject: [PATCH] Adopt SIMD in attribute and text scanning https://bugs.webkit.org/show_bug.cgi?id=273977 rdar://127843610 Reviewed by Mark Lam. This patch integrates SIMD into attribute value and string scanning. We also attach UNLIKELY / LIKELY to bailout paths. * Source/WebCore/html/parser/HTMLDocumentParserFastPath.cpp: (WebCore::HTMLFastPathParser::parseCompleteInput): (WebCore::HTMLFastPathParser::scanText): (WebCore::HTMLFastPathParser::scanTagName): (WebCore::HTMLFastPathParser::scanAttributeValue): (WebCore::HTMLFastPathParser::parseChildren): (WebCore::HTMLFastPathParser::parseAttributes): (WebCore::HTMLFastPathParser::parseContainerElement): Canonical link: https://commits.webkit.org/278647@main --- .../JavaScriptCore/runtime/LiteralParser.cpp | 17 +-- Source/WTF/wtf/text/StringParsingBuffer.h | 2 +- .../parser/HTMLDocumentParserFastPath.cpp | 138 ++++++++++++++---- 3 files changed, 121 insertions(+), 36 deletions(-) diff --git a/Source/JavaScriptCore/runtime/LiteralParser.cpp b/Source/JavaScriptCore/runtime/LiteralParser.cpp index 51bbf6faaebc..efa6e2bee93c 100644 --- a/Source/JavaScriptCore/runtime/LiteralParser.cpp +++ b/Source/JavaScriptCore/runtime/LiteralParser.cpp @@ -875,24 +875,23 @@ ALWAYS_INLINE TokenType LiteralParser::Lexer::lexString(LiteralParserT constexpr auto quoteMask = SIMD::splat(static_cast('"')); constexpr auto escapeMask = SIMD::splat(static_cast('\\')); constexpr auto controlMask = SIMD::splat(static_cast(' ')); - for (; m_ptr + (stride - 1) < m_end; m_ptr += stride) { - auto input = SIMD::load(bitwise_cast(m_ptr)); + auto match = [&](auto* cursor) ALWAYS_INLINE_LAMBDA { + auto input = SIMD::load(bitwise_cast(cursor)); auto quotes = SIMD::equal(input, quoteMask); auto escapes = SIMD::equal(input, escapeMask); auto controls = SIMD::lessThan(input, controlMask); auto mask = SIMD::merge(quotes, SIMD::merge(escapes, controls)); - if (auto index = SIMD::findFirstNonZeroIndex(mask)) { + return SIMD::findFirstNonZeroIndex(mask); + }; + + for (; m_ptr + (stride - 1) < m_end; m_ptr += stride) { + if (auto index = match(m_ptr)) { m_ptr += index.value(); return; } } if (m_ptr < m_end) { - auto input = SIMD::load(bitwise_cast(m_end - stride)); - auto quotes = SIMD::equal(input, quoteMask); - auto escapes = SIMD::equal(input, escapeMask); - auto controls = SIMD::lessThan(input, controlMask); - auto mask = SIMD::merge(quotes, SIMD::merge(escapes, controls)); - if (auto index = SIMD::findFirstNonZeroIndex(mask)) { + if (auto index = match(m_end - stride)) { m_ptr = m_end - stride + index.value(); return; } diff --git a/Source/WTF/wtf/text/StringParsingBuffer.h b/Source/WTF/wtf/text/StringParsingBuffer.h index aad777c6c654..00a3955ea2e6 100644 --- a/Source/WTF/wtf/text/StringParsingBuffer.h +++ b/Source/WTF/wtf/text/StringParsingBuffer.h @@ -53,7 +53,7 @@ class StringParsingBuffer final { constexpr void setPosition(const CharacterType* position) { - ASSERT(!m_data.empty()); + ASSERT(position <= m_data.data() + m_data.size()); m_data = { position, m_data.data() + m_data.size() }; } diff --git a/Source/WebCore/html/parser/HTMLDocumentParserFastPath.cpp b/Source/WebCore/html/parser/HTMLDocumentParserFastPath.cpp index 4b9d46b7daa3..788fc8363bf7 100644 --- a/Source/WebCore/html/parser/HTMLDocumentParserFastPath.cpp +++ b/Source/WebCore/html/parser/HTMLDocumentParserFastPath.cpp @@ -466,7 +466,7 @@ class HTMLFastPathParser { template void parseCompleteInput() { parseChildren(m_destinationParent.get()); - if (m_parsingBuffer.hasCharactersRemaining()) + if (UNLIKELY(m_parsingBuffer.hasCharactersRemaining())) didFail(HTMLFastPathResult::FailedDidntReachEndOfInput); } @@ -477,21 +477,65 @@ class HTMLFastPathParser { String scanText() { auto* start = m_parsingBuffer.position(); - while (m_parsingBuffer.hasCharactersRemaining() && *m_parsingBuffer != '<') { - // '&' indicates escape sequences, '\r' might require - // https://infra.spec.whatwg.org/#normalize-newlines - if (*m_parsingBuffer == '&' || *m_parsingBuffer == '\r') { - m_parsingBuffer.setPosition(start); - return scanEscapedText(); + auto* cursor = start; + const auto* end = start + m_parsingBuffer.lengthRemaining(); + ([&]() ALWAYS_INLINE_LAMBDA { + constexpr size_t stride = 16 / sizeof(CharacterType); + using UnsignedType = std::make_unsigned_t; + if (static_cast(end - cursor) >= stride) { + const auto quoteMask = SIMD::splat(static_cast('<')); + const auto escapeMask = SIMD::splat(static_cast('&')); + const auto newlineMask = SIMD::splat(static_cast('\r')); + const auto zeroMask = SIMD::splat(static_cast(0)); + + auto match = [&](auto* cursor) ALWAYS_INLINE_LAMBDA { + auto input = SIMD::load(bitwise_cast(cursor)); + auto quotes = SIMD::equal(input, quoteMask); + auto escapes = SIMD::equal(input, escapeMask); + auto newlines = SIMD::equal(input, newlineMask); + auto zeros = SIMD::equal(input, zeroMask); + auto mask = SIMD::merge(zeros, SIMD::merge(quotes, SIMD::merge(escapes, newlines))); + return SIMD::findFirstNonZeroIndex(mask); + }; + + for (; cursor + (stride - 1) < end; cursor += stride) { + if (auto index = match(cursor)) { + cursor += index.value(); + return; + } + } + if (cursor < end) { + if (auto index = match(end - stride)) { + cursor = end - stride + index.value(); + return; + } + cursor = end; + } + return; } - if (UNLIKELY(*m_parsingBuffer == '\0')) + + for (; cursor != end; ++cursor) { + auto character = *cursor; + if (character == '<' || character == '&' || character == '\r' || character == '\0') + return; + } + }()); + m_parsingBuffer.setPosition(cursor); + + if (cursor != end) { + if (UNLIKELY(*cursor == '\0')) return didFail(HTMLFastPathResult::FailedContainsNull, String()); - m_parsingBuffer.advance(); + if (*cursor == '&' || *cursor == '\r') { + m_parsingBuffer.setPosition(start); + return scanEscapedText(); + } } - unsigned length = m_parsingBuffer.position() - start; + + unsigned length = cursor - start; if (UNLIKELY(length >= Text::defaultLengthLimit)) return didFail(HTMLFastPathResult::FailedBigText, String()); + return length ? String({ start, length }) : String(); } @@ -540,7 +584,7 @@ class HTMLFastPathParser { m_parsingBuffer.advance(); m_charBuffer.append(c); } - if (m_parsingBuffer.atEnd() || !isCharAfterTagNameOrAttribute(*m_parsingBuffer)) + if (UNLIKELY(m_parsingBuffer.atEnd() || !isCharAfterTagNameOrAttribute(*m_parsingBuffer))) return didFail(HTMLFastPathResult::FailedParsingTagName, ElementName::Unknown); skipWhile(m_parsingBuffer); return findHTMLElementName(m_charBuffer.span()); @@ -596,22 +640,64 @@ class HTMLFastPathParser { if (m_parsingBuffer.hasCharactersRemaining() && isQuoteCharacter(*m_parsingBuffer)) { auto quoteChar = m_parsingBuffer.consume(); start = m_parsingBuffer.position(); - for (; m_parsingBuffer.hasCharactersRemaining() && *m_parsingBuffer != quoteChar; m_parsingBuffer.advance()) { - if (*m_parsingBuffer == '&' || *m_parsingBuffer == '\r') { - m_parsingBuffer.setPosition(start - 1); - return scanEscapedAttributeValue(); + auto* cursor = start; + const auto* end = start + m_parsingBuffer.lengthRemaining(); + ([&]() ALWAYS_INLINE_LAMBDA { + constexpr size_t stride = 16 / sizeof(CharacterType); + using UnsignedType = std::make_unsigned_t; + if (static_cast(end - cursor) >= stride) { + const auto quoteMask = SIMD::splat(static_cast(quoteChar)); + const auto escapeMask = SIMD::splat(static_cast('&')); + const auto newlineMask = SIMD::splat(static_cast('\r')); + + auto match = [&](auto* cursor) ALWAYS_INLINE_LAMBDA { + auto input = SIMD::load(bitwise_cast(cursor)); + auto quotes = SIMD::equal(input, quoteMask); + auto escapes = SIMD::equal(input, escapeMask); + auto newlines = SIMD::equal(input, newlineMask); + auto mask = SIMD::merge(quotes, SIMD::merge(escapes, newlines)); + return SIMD::findFirstNonZeroIndex(mask); + }; + + for (; cursor + (stride - 1) < end; cursor += stride) { + if (auto index = match(cursor)) { + cursor += index.value(); + return; + } + } + if (cursor < end) { + if (auto index = match(end - stride)) { + cursor = end - stride + index.value(); + return; + } + cursor = end; + } + return; } - } - if (m_parsingBuffer.atEnd()) + + for (; cursor != end; ++cursor) { + auto character = *cursor; + if (character == quoteChar || character == '&' || character == '\r') + return; + } + }()); + + if (UNLIKELY(cursor == end)) return didFail(HTMLFastPathResult::FailedParsingQuotedAttributeValue, emptyAtom()); - length = m_parsingBuffer.position() - start; - if (m_parsingBuffer.consume() != quoteChar) + length = cursor - start; + if (UNLIKELY(*cursor != quoteChar)) { + if (LIKELY(*cursor == '&' || *cursor == '\r')) { + m_parsingBuffer.setPosition(start - 1); + return scanEscapedAttributeValue(); + } return didFail(HTMLFastPathResult::FailedParsingQuotedAttributeValue, emptyAtom()); + } + m_parsingBuffer.setPosition(cursor + 1); } else { skipWhile(m_parsingBuffer); length = m_parsingBuffer.position() - start; - if (m_parsingBuffer.atEnd() || !isCharAfterUnquotedAttribute(*m_parsingBuffer)) + if (UNLIKELY(m_parsingBuffer.atEnd() || !isCharAfterUnquotedAttribute(*m_parsingBuffer))) return didFail(HTMLFastPathResult::FailedParsingUnquotedAttributeValue, emptyAtom()); } return HTMLNameCache::makeAttributeValue({ start, length }); @@ -700,7 +786,7 @@ class HTMLFastPathParser { // We assume that we found the closing tag. The tagName will be checked by the caller `parseContainerElement()`. return; } - if (++m_elementDepth == Settings::defaultMaximumHTMLParserDOMTreeDepth) + if (UNLIKELY(++m_elementDepth == Settings::defaultMaximumHTMLParserDOMTreeDepth)) return didFail(HTMLFastPathResult::FailedMaxDepth); auto child = ParentTag::parseChild(parent, *this); --m_elementDepth; @@ -719,7 +805,7 @@ class HTMLFastPathParser { while (true) { auto attributeName = scanAttributeName(); if (attributeName == nullQName()) { - if (m_parsingBuffer.hasCharactersRemaining()) { + if (LIKELY(m_parsingBuffer.hasCharactersRemaining())) { if (*m_parsingBuffer == '>') { m_parsingBuffer.advance(); break; @@ -727,7 +813,7 @@ class HTMLFastPathParser { if (*m_parsingBuffer == '/') { m_parsingBuffer.advance(); skipWhile(m_parsingBuffer); - if (m_parsingBuffer.atEnd() || m_parsingBuffer.consume() != '>') + if (UNLIKELY(m_parsingBuffer.atEnd() || m_parsingBuffer.consume() != '>')) return didFail(HTMLFastPathResult::FailedParsingAttributes); break; } @@ -847,7 +933,7 @@ class HTMLFastPathParser { parent.parserAppendChild(element); element->beginParsingChildren(); parseChildren(element); - if (parsingFailed() || m_parsingBuffer.atEnd()) + if (UNLIKELY(parsingFailed() || m_parsingBuffer.atEnd())) return didFail(HTMLFastPathResult::FailedEndOfInputReachedForContainer, element); // parseChildren(element) stops after the (hopefully) closing tag's `<` @@ -856,12 +942,12 @@ class HTMLFastPathParser { m_parsingBuffer.advance(); if (UNLIKELY(!skipCharactersExactly(m_parsingBuffer, Tag::tagNameCharacters))) { - if (!skipLettersExactlyIgnoringASCIICase(m_parsingBuffer, Tag::tagNameCharacters)) + if (UNLIKELY(!skipLettersExactlyIgnoringASCIICase(m_parsingBuffer, Tag::tagNameCharacters))) return didFail(HTMLFastPathResult::FailedEndTagNameMismatch, element); } skipWhile(m_parsingBuffer); - if (m_parsingBuffer.atEnd() || m_parsingBuffer.consume() != '>') + if (UNLIKELY(m_parsingBuffer.atEnd() || m_parsingBuffer.consume() != '>')) return didFail(HTMLFastPathResult::FailedUnexpectedTagNameCloseState, element); element->finishParsingChildren();