From b58c8bed4445a0192e15f9bdc67bd0f0c45d4ea2 Mon Sep 17 00:00:00 2001 From: Anne van Kesteren Date: Tue, 4 Apr 2023 10:22:04 -0700 Subject: [PATCH] Use more ICU surrogate code point macros https://bugs.webkit.org/show_bug.cgi?id=254984 rdar://107602109 Reviewed by Darin Adler. Make WebKit rely more on centrally defined checks and constants for code points, in particular for surrogates. * Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp: (JSC::JSImmutableButterfly::createFromString): * Source/JavaScriptCore/runtime/RegExpObjectInlines.h: (JSC::advanceStringUnicode): * Source/WTF/wtf/text/StringImpl.cpp: (WTF::StringImpl::utf8ForCharactersIntoBuffer): * Source/WebCore/PAL/pal/text/TextCodec.cpp: (PAL::TextCodec::getUnencodableReplacement): * Source/WebCore/css/parser/CSSTokenizer.cpp: (WebCore::CSSTokenizer::consumeEscape): * Source/WebCore/dom/TextEncoderStreamEncoder.cpp: (WebCore::TextEncoderStreamEncoder::encode): (WebCore::TextEncoderStreamEncoder::flush): * Source/WebCore/dom/TextEncoderStreamEncoder.h: Canonical link: https://commits.webkit.org/262581@main --- .../runtime/JSImmutableButterfly.cpp | 4 ++-- .../runtime/RegExpObjectInlines.h | 4 ++-- Source/WTF/wtf/text/StringImpl.cpp | 4 ++-- Source/WebCore/PAL/pal/text/TextCodec.cpp | 5 +++-- Source/WebCore/css/parser/CSSTokenizer.cpp | 2 +- .../WebCore/dom/TextEncoderStreamEncoder.cpp | 22 +++++++++---------- Source/WebCore/dom/TextEncoderStreamEncoder.h | 2 +- 7 files changed, 22 insertions(+), 21 deletions(-) diff --git a/Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp b/Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp index 98e9b817c7c0..be59bfa87cf3 100644 --- a/Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp +++ b/Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp @@ -183,13 +183,13 @@ JSImmutableButterfly* JSImmutableButterfly::createFromString(JSGlobalObject* glo auto forEachCodePointViaStringIteratorProtocol = [](const UChar* characters, unsigned length, auto func) { for 
(unsigned i = 0; i < length; ++i) { UChar character = characters[i]; - if (character < 0xD800 || character > 0xDBFF || (i + 1) == length) { + if (!U16_IS_LEAD(character) || (i + 1) == length) { if (func(i, 1) == IterationStatus::Done) return; continue; } UChar second = characters[i + 1]; - if (second < 0xDC00 || second > 0xDFFF) { + if (!U16_IS_TRAIL(second)) { if (func(i, 1) == IterationStatus::Done) return; continue; diff --git a/Source/JavaScriptCore/runtime/RegExpObjectInlines.h b/Source/JavaScriptCore/runtime/RegExpObjectInlines.h index e56d0ab74938..968738b23b2a 100644 --- a/Source/JavaScriptCore/runtime/RegExpObjectInlines.h +++ b/Source/JavaScriptCore/runtime/RegExpObjectInlines.h @@ -130,11 +130,11 @@ inline unsigned advanceStringUnicode(String s, unsigned length, unsigned current return currentIndex + 1; UChar first = s[currentIndex]; - if (first < 0xD800 || first > 0xDBFF) + if (!U16_IS_LEAD(first)) return currentIndex + 1; UChar second = s[currentIndex + 1]; - if (second < 0xDC00 || second > 0xDFFF) + if (!U16_IS_TRAIL(second)) return currentIndex + 1; return currentIndex + 2; diff --git a/Source/WTF/wtf/text/StringImpl.cpp b/Source/WTF/wtf/text/StringImpl.cpp index 789b6b49d1eb..0758851c0b54 100644 --- a/Source/WTF/wtf/text/StringImpl.cpp +++ b/Source/WTF/wtf/text/StringImpl.cpp @@ -1573,7 +1573,7 @@ Expected StringImpl::utf8ForCharactersIntoBuffer(co // Conversion fails when there is an unpaired surrogate. // Put replacement character (U+FFFD) instead of the unpaired surrogate. if (result != ConversionResult::Success) { - ASSERT((0xD800 <= *characters && *characters <= 0xDFFF)); + ASSERT(U16_IS_SURROGATE(*characters)); // There should be room left, since one UChar hasn't been converted. 
ASSERT((buffer + 3) <= bufferEnd); putUTF8Triple(buffer, replacementCharacter); @@ -1593,7 +1593,7 @@ Expected StringImpl::utf8ForCharactersIntoBuffer(co if (strict) return makeUnexpected(UTF8ConversionError::SourceExhausted); ASSERT(characters + 1 == charactersEnd); - ASSERT((0xD800 <= *characters && *characters <= 0xDFFF)); + ASSERT(U16_IS_SURROGATE(*characters)); putUTF8Triple(buffer, *characters); break; case ConversionResult::TargetExhausted: diff --git a/Source/WebCore/PAL/pal/text/TextCodec.cpp b/Source/WebCore/PAL/pal/text/TextCodec.cpp index 184edcb8df96..9ec7ba40fc27 100644 --- a/Source/WebCore/PAL/pal/text/TextCodec.cpp +++ b/Source/WebCore/PAL/pal/text/TextCodec.cpp @@ -26,6 +26,7 @@ #include "config.h" #include "TextCodec.h" +#include #include #include @@ -39,8 +40,8 @@ int TextCodec::getUnencodableReplacement(UChar32 codePoint, UnencodableHandling // The Encoding Standard doesn't have surrogate code points in the input, but that would require // scanning and potentially manipulating inputs ahead of time. Instead handle them at the last // possible point. 
- if (codePoint >= 0xD800 && codePoint <= 0xDFFF) - codePoint = 0xFFFD; + if (U_IS_SURROGATE(codePoint)) + codePoint = replacementCharacter; switch (handling) { case UnencodableHandling::Entities: diff --git a/Source/WebCore/css/parser/CSSTokenizer.cpp b/Source/WebCore/css/parser/CSSTokenizer.cpp index b59161fabadc..17451546b079 100644 --- a/Source/WebCore/css/parser/CSSTokenizer.cpp +++ b/Source/WebCore/css/parser/CSSTokenizer.cpp @@ -805,7 +805,7 @@ UChar32 CSSTokenizer::consumeEscape() }; consumeSingleWhitespaceIfNext(); auto codePoint = parseInteger(hexChars, 16).value(); - if (!codePoint || (0xD800 <= codePoint && codePoint <= 0xDFFF) || codePoint > 0x10FFFF) + if (!codePoint || U_IS_SURROGATE(codePoint) || codePoint > 0x10FFFF) return replacementCharacter; return codePoint; } diff --git a/Source/WebCore/dom/TextEncoderStreamEncoder.cpp b/Source/WebCore/dom/TextEncoderStreamEncoder.cpp index a104fdf14cc2..20c4234f38c6 100644 --- a/Source/WebCore/dom/TextEncoderStreamEncoder.cpp +++ b/Source/WebCore/dom/TextEncoderStreamEncoder.cpp @@ -28,6 +28,7 @@ #include #include #include +#include namespace WebCore { @@ -44,22 +45,21 @@ RefPtr TextEncoderStreamEncoder::encode(const String& input) for (size_t cptr = 0; cptr < view.length(); cptr++) { // https://encoding.spec.whatwg.org/#convert-code-unit-to-scalar-value auto token = view[cptr]; - if (m_pendingHighSurrogate) { - auto highSurrogate = *m_pendingHighSurrogate; - m_pendingHighSurrogate = std::nullopt; - if (token >= 0xDC00 && token <= 0xDFFF) { - auto codePoint = 0x10000 + ((highSurrogate - 0xD800) << 10) + (token - 0xDC00); + if (m_pendingLeadSurrogate) { + auto leadSurrogate = *std::exchange(m_pendingLeadSurrogate, std::nullopt); + if (U16_IS_TRAIL(token)) { + auto codePoint = U16_GET_SUPPLEMENTARY(leadSurrogate, token); U8_APPEND_UNSAFE(bytes.data(), bytesWritten, codePoint); continue; } - U8_APPEND_UNSAFE(bytes.data(), bytesWritten, 0XFFFD); + U8_APPEND_UNSAFE(bytes.data(), bytesWritten, 
replacementCharacter); } - if (token >= 0xD800 && token <= 0xDBFF) { - m_pendingHighSurrogate = token; + if (U16_IS_LEAD(token)) { + m_pendingLeadSurrogate = token; continue; } - if (token >= 0xDC00 && token <= 0xDFFF) { - U8_APPEND_UNSAFE(bytes.data(), bytesWritten, 0XFFFD); + if (U16_IS_TRAIL(token)) { + U8_APPEND_UNSAFE(bytes.data(), bytesWritten, replacementCharacter); continue; } U8_APPEND_UNSAFE(bytes.data(), bytesWritten, token); @@ -74,7 +74,7 @@ RefPtr<Uint8Array> TextEncoderStreamEncoder::encode(const String& input) RefPtr<Uint8Array> TextEncoderStreamEncoder::flush() { - if (!m_pendingHighSurrogate) + if (!m_pendingLeadSurrogate) return nullptr; constexpr uint8_t byteSequence[] = { 0xEF, 0xBF, 0xBD }; diff --git a/Source/WebCore/dom/TextEncoderStreamEncoder.h b/Source/WebCore/dom/TextEncoderStreamEncoder.h index 89fd207ad498..2a2e00cee4b9 100644 --- a/Source/WebCore/dom/TextEncoderStreamEncoder.h +++ b/Source/WebCore/dom/TextEncoderStreamEncoder.h @@ -42,7 +42,7 @@ class TextEncoderStreamEncoder : public RefCounted<TextEncoderStreamEncoder> { private: TextEncoderStreamEncoder() = default; - std::optional<UChar> m_pendingHighSurrogate; + std::optional<UChar> m_pendingLeadSurrogate; }; }