From b58c8bed4445a0192e15f9bdc67bd0f0c45d4ea2 Mon Sep 17 00:00:00 2001
From: Anne van Kesteren <annevk@annevk.nl>
Date: Tue, 4 Apr 2023 10:22:04 -0700
Subject: [PATCH] Use more ICU surrogate code point macros
 https://bugs.webkit.org/show_bug.cgi?id=254984 rdar://107602109

Reviewed by Darin Adler.

Make WebKit rely more on centrally defined checks and constants for code points, in particular for surrogates.

* Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp:
(JSC::JSImmutableButterfly::createFromString):
* Source/JavaScriptCore/runtime/RegExpObjectInlines.h:
(JSC::advanceStringUnicode):
* Source/WTF/wtf/text/StringImpl.cpp:
(WTF::StringImpl::utf8ForCharactersIntoBuffer):
* Source/WebCore/PAL/pal/text/TextCodec.cpp:
(PAL::TextCodec::getUnencodableReplacement):
* Source/WebCore/css/parser/CSSTokenizer.cpp:
(WebCore::CSSTokenizer::consumeEscape):
* Source/WebCore/dom/TextEncoderStreamEncoder.cpp:
(WebCore::TextEncoderStreamEncoder::encode):
(WebCore::TextEncoderStreamEncoder::flush):
* Source/WebCore/dom/TextEncoderStreamEncoder.h:

Canonical link: https://commits.webkit.org/262581@main
---
 .../runtime/JSImmutableButterfly.cpp          |  4 ++--
 .../runtime/RegExpObjectInlines.h             |  4 ++--
 Source/WTF/wtf/text/StringImpl.cpp            |  4 ++--
 Source/WebCore/PAL/pal/text/TextCodec.cpp     |  5 +++--
 Source/WebCore/css/parser/CSSTokenizer.cpp    |  2 +-
 .../WebCore/dom/TextEncoderStreamEncoder.cpp  | 22 +++++++++----------
 Source/WebCore/dom/TextEncoderStreamEncoder.h |  2 +-
 7 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp b/Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp
index 98e9b817c7c0..be59bfa87cf3 100644
--- a/Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp
+++ b/Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp
@@ -183,13 +183,13 @@ JSImmutableButterfly* JSImmutableButterfly::createFromString(JSGlobalObject* glo
     auto forEachCodePointViaStringIteratorProtocol = [](const UChar* characters, unsigned length, auto func) {
         for (unsigned i = 0; i < length; ++i) {
             UChar character = characters[i];
-            if (character < 0xD800 || character > 0xDBFF || (i + 1) == length) {
+            if (!U16_IS_LEAD(character) || (i + 1) == length) {
                 if (func(i, 1) == IterationStatus::Done)
                     return;
                 continue;
             }
             UChar second = characters[i + 1];
-            if (second < 0xDC00 || second > 0xDFFF) {
+            if (!U16_IS_TRAIL(second)) {
                 if (func(i, 1) == IterationStatus::Done)
                     return;
                 continue;
diff --git a/Source/JavaScriptCore/runtime/RegExpObjectInlines.h b/Source/JavaScriptCore/runtime/RegExpObjectInlines.h
index e56d0ab74938..968738b23b2a 100644
--- a/Source/JavaScriptCore/runtime/RegExpObjectInlines.h
+++ b/Source/JavaScriptCore/runtime/RegExpObjectInlines.h
@@ -130,11 +130,11 @@ inline unsigned advanceStringUnicode(String s, unsigned length, unsigned current
         return currentIndex + 1;
 
     UChar first = s[currentIndex];
-    if (first < 0xD800 || first > 0xDBFF)
+    if (!U16_IS_LEAD(first))
         return currentIndex + 1;
 
     UChar second = s[currentIndex + 1];
-    if (second < 0xDC00 || second > 0xDFFF)
+    if (!U16_IS_TRAIL(second))
         return currentIndex + 1;
 
     return currentIndex + 2;
diff --git a/Source/WTF/wtf/text/StringImpl.cpp b/Source/WTF/wtf/text/StringImpl.cpp
index 789b6b49d1eb..0758851c0b54 100644
--- a/Source/WTF/wtf/text/StringImpl.cpp
+++ b/Source/WTF/wtf/text/StringImpl.cpp
@@ -1573,7 +1573,7 @@ Expected<size_t, UTF8ConversionError> StringImpl::utf8ForCharactersIntoBuffer(co
             // Conversion fails when there is an unpaired surrogate.
             // Put replacement character (U+FFFD) instead of the unpaired surrogate.
             if (result != ConversionResult::Success) {
-                ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
+                ASSERT(U16_IS_SURROGATE(*characters));
                 // There should be room left, since one UChar hasn't been converted.
                 ASSERT((buffer + 3) <= bufferEnd);
                 putUTF8Triple(buffer, replacementCharacter);
@@ -1593,7 +1593,7 @@ Expected<size_t, UTF8ConversionError> StringImpl::utf8ForCharactersIntoBuffer(co
             if (strict)
                 return makeUnexpected(UTF8ConversionError::SourceExhausted);
             ASSERT(characters + 1 == charactersEnd);
-            ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
+            ASSERT(U16_IS_SURROGATE(*characters));
             putUTF8Triple(buffer, *characters);
                 break;
         case ConversionResult::TargetExhausted:
diff --git a/Source/WebCore/PAL/pal/text/TextCodec.cpp b/Source/WebCore/PAL/pal/text/TextCodec.cpp
index 184edcb8df96..9ec7ba40fc27 100644
--- a/Source/WebCore/PAL/pal/text/TextCodec.cpp
+++ b/Source/WebCore/PAL/pal/text/TextCodec.cpp
@@ -26,6 +26,7 @@
 
 #include "config.h"
 #include "TextCodec.h"
+#include <wtf/unicode/CharacterNames.h>
 
 #include <array>
 #include <cstdio>
@@ -39,8 +40,8 @@ int TextCodec::getUnencodableReplacement(UChar32 codePoint, UnencodableHandling
     // The Encoding Standard doesn't have surrogate code points in the input, but that would require
     // scanning and potentially manipulating inputs ahead of time. Instead handle them at the last
     // possible point.
-    if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
-        codePoint = 0xFFFD;
+    if (U_IS_SURROGATE(codePoint))
+        codePoint = replacementCharacter;
 
     switch (handling) {
     case UnencodableHandling::Entities:
diff --git a/Source/WebCore/css/parser/CSSTokenizer.cpp b/Source/WebCore/css/parser/CSSTokenizer.cpp
index b59161fabadc..17451546b079 100644
--- a/Source/WebCore/css/parser/CSSTokenizer.cpp
+++ b/Source/WebCore/css/parser/CSSTokenizer.cpp
@@ -805,7 +805,7 @@ UChar32 CSSTokenizer::consumeEscape()
         };
         consumeSingleWhitespaceIfNext();
         auto codePoint = parseInteger<uint32_t>(hexChars, 16).value();
-        if (!codePoint || (0xD800 <= codePoint && codePoint <= 0xDFFF) || codePoint > 0x10FFFF)
+        if (!codePoint || U_IS_SURROGATE(codePoint) || codePoint > 0x10FFFF)
             return replacementCharacter;
         return codePoint;
     }
diff --git a/Source/WebCore/dom/TextEncoderStreamEncoder.cpp b/Source/WebCore/dom/TextEncoderStreamEncoder.cpp
index a104fdf14cc2..20c4234f38c6 100644
--- a/Source/WebCore/dom/TextEncoderStreamEncoder.cpp
+++ b/Source/WebCore/dom/TextEncoderStreamEncoder.cpp
@@ -28,6 +28,7 @@
 #include <JavaScriptCore/GenericTypedArrayViewInlines.h>
 #include <JavaScriptCore/JSCInlines.h>
 #include <JavaScriptCore/JSGenericTypedArrayViewInlines.h>
+#include <wtf/unicode/CharacterNames.h>
 
 namespace WebCore {
 
@@ -44,22 +45,21 @@ RefPtr<Uint8Array> TextEncoderStreamEncoder::encode(const String& input)
     for (size_t cptr = 0; cptr < view.length(); cptr++) {
         // https://encoding.spec.whatwg.org/#convert-code-unit-to-scalar-value
         auto token = view[cptr];
-        if (m_pendingHighSurrogate) {
-            auto highSurrogate = *m_pendingHighSurrogate;
-            m_pendingHighSurrogate = std::nullopt;
-            if (token >= 0xDC00 && token <= 0xDFFF) {
-                auto codePoint = 0x10000 + ((highSurrogate - 0xD800) << 10) + (token - 0xDC00);
+        if (m_pendingLeadSurrogate) {
+            auto leadSurrogate = *std::exchange(m_pendingLeadSurrogate, std::nullopt);
+            if (U16_IS_TRAIL(token)) {
+                auto codePoint = U16_GET_SUPPLEMENTARY(leadSurrogate, token);
                 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, codePoint);
                 continue;
             }
-            U8_APPEND_UNSAFE(bytes.data(), bytesWritten, 0XFFFD);
+            U8_APPEND_UNSAFE(bytes.data(), bytesWritten, replacementCharacter);
         }
-        if (token >= 0xD800 && token <= 0xDBFF) {
-            m_pendingHighSurrogate = token;
+        if (U16_IS_LEAD(token)) {
+            m_pendingLeadSurrogate = token;
             continue;
         }
-        if (token >= 0xDC00 && token <= 0xDFFF) {
-            U8_APPEND_UNSAFE(bytes.data(), bytesWritten, 0XFFFD);
+        if (U16_IS_TRAIL(token)) {
+            U8_APPEND_UNSAFE(bytes.data(), bytesWritten, replacementCharacter);
             continue;
         }
         U8_APPEND_UNSAFE(bytes.data(), bytesWritten, token);
@@ -74,7 +74,7 @@ RefPtr<Uint8Array> TextEncoderStreamEncoder::encode(const String& input)
 
 RefPtr<Uint8Array> TextEncoderStreamEncoder::flush()
 {
-    if (!m_pendingHighSurrogate)
+    if (!m_pendingLeadSurrogate)
         return nullptr;
 
     constexpr uint8_t byteSequence[] = { 0xEF, 0xBF, 0xBD };
diff --git a/Source/WebCore/dom/TextEncoderStreamEncoder.h b/Source/WebCore/dom/TextEncoderStreamEncoder.h
index 89fd207ad498..2a2e00cee4b9 100644
--- a/Source/WebCore/dom/TextEncoderStreamEncoder.h
+++ b/Source/WebCore/dom/TextEncoderStreamEncoder.h
@@ -42,7 +42,7 @@ class TextEncoderStreamEncoder : public RefCounted<TextEncoderStreamEncoder> {
 private:
     TextEncoderStreamEncoder() = default;
 
-    std::optional<UChar> m_pendingHighSurrogate;
+    std::optional<UChar> m_pendingLeadSurrogate;
 };
 
 }