Skip to content

Commit

Permalink
Use more ICU surrogate code point macros
Browse files Browse the repository at this point in the history
https://bugs.webkit.org/show_bug.cgi?id=254984
rdar://107602109

Reviewed by Darin Adler.

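Make WebKit rely more on centrally defined checks and constants for code points, in particular for surrogates: replace hand-written range comparisons such as 0xD800-0xDFFF with ICU's U16_IS_LEAD, U16_IS_TRAIL, U16_IS_SURROGATE, U_IS_SURROGATE, and U16_GET_SUPPLEMENTARY macros, and replace the literal 0xFFFD with the shared replacementCharacter constant.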

* Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp:
(JSC::JSImmutableButterfly::createFromString):
* Source/JavaScriptCore/runtime/RegExpObjectInlines.h:
(JSC::advanceStringUnicode):
* Source/WTF/wtf/text/StringImpl.cpp:
(WTF::StringImpl::utf8ForCharactersIntoBuffer):
* Source/WebCore/PAL/pal/text/TextCodec.cpp:
(PAL::TextCodec::getUnencodableReplacement):
* Source/WebCore/css/parser/CSSTokenizer.cpp:
(WebCore::CSSTokenizer::consumeEscape):
* Source/WebCore/dom/TextEncoderStreamEncoder.cpp:
(WebCore::TextEncoderStreamEncoder::encode):
(WebCore::TextEncoderStreamEncoder::flush):
* Source/WebCore/dom/TextEncoderStreamEncoder.h:

Canonical link: https://commits.webkit.org/262581@main
  • Loading branch information
annevk authored and Ahmad Saleem committed Apr 4, 2023
1 parent 7f3a0f6 commit b58c8be
Show file tree
Hide file tree
Showing 7 changed files with 22 additions and 21 deletions.
4 changes: 2 additions & 2 deletions Source/JavaScriptCore/runtime/JSImmutableButterfly.cpp
Expand Up @@ -183,13 +183,13 @@ JSImmutableButterfly* JSImmutableButterfly::createFromString(JSGlobalObject* glo
auto forEachCodePointViaStringIteratorProtocol = [](const UChar* characters, unsigned length, auto func) {
for (unsigned i = 0; i < length; ++i) {
UChar character = characters[i];
if (character < 0xD800 || character > 0xDBFF || (i + 1) == length) {
if (!U16_IS_LEAD(character) || (i + 1) == length) {
if (func(i, 1) == IterationStatus::Done)
return;
continue;
}
UChar second = characters[i + 1];
if (second < 0xDC00 || second > 0xDFFF) {
if (!U16_IS_TRAIL(second)) {
if (func(i, 1) == IterationStatus::Done)
return;
continue;
Expand Down
4 changes: 2 additions & 2 deletions Source/JavaScriptCore/runtime/RegExpObjectInlines.h
Expand Up @@ -130,11 +130,11 @@ inline unsigned advanceStringUnicode(String s, unsigned length, unsigned current
return currentIndex + 1;

UChar first = s[currentIndex];
if (first < 0xD800 || first > 0xDBFF)
if (!U16_IS_LEAD(first))
return currentIndex + 1;

UChar second = s[currentIndex + 1];
if (second < 0xDC00 || second > 0xDFFF)
if (!U16_IS_TRAIL(second))
return currentIndex + 1;

return currentIndex + 2;
Expand Down
4 changes: 2 additions & 2 deletions Source/WTF/wtf/text/StringImpl.cpp
Expand Up @@ -1573,7 +1573,7 @@ Expected<size_t, UTF8ConversionError> StringImpl::utf8ForCharactersIntoBuffer(co
// Conversion fails when there is an unpaired surrogate.
// Put replacement character (U+FFFD) instead of the unpaired surrogate.
if (result != ConversionResult::Success) {
ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
ASSERT(U16_IS_SURROGATE(*characters));
// There should be room left, since one UChar hasn't been converted.
ASSERT((buffer + 3) <= bufferEnd);
putUTF8Triple(buffer, replacementCharacter);
Expand All @@ -1593,7 +1593,7 @@ Expected<size_t, UTF8ConversionError> StringImpl::utf8ForCharactersIntoBuffer(co
if (strict)
return makeUnexpected(UTF8ConversionError::SourceExhausted);
ASSERT(characters + 1 == charactersEnd);
ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
ASSERT(U16_IS_SURROGATE(*characters));
putUTF8Triple(buffer, *characters);
break;
case ConversionResult::TargetExhausted:
Expand Down
5 changes: 3 additions & 2 deletions Source/WebCore/PAL/pal/text/TextCodec.cpp
Expand Up @@ -26,6 +26,7 @@

#include "config.h"
#include "TextCodec.h"
#include <wtf/unicode/CharacterNames.h>

#include <array>
#include <cstdio>
Expand All @@ -39,8 +40,8 @@ int TextCodec::getUnencodableReplacement(UChar32 codePoint, UnencodableHandling
// The Encoding Standard doesn't have surrogate code points in the input, but that would require
// scanning and potentially manipulating inputs ahead of time. Instead handle them at the last
// possible point.
if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
codePoint = 0xFFFD;
if (U_IS_SURROGATE(codePoint))
codePoint = replacementCharacter;

switch (handling) {
case UnencodableHandling::Entities:
Expand Down
2 changes: 1 addition & 1 deletion Source/WebCore/css/parser/CSSTokenizer.cpp
Expand Up @@ -805,7 +805,7 @@ UChar32 CSSTokenizer::consumeEscape()
};
consumeSingleWhitespaceIfNext();
auto codePoint = parseInteger<uint32_t>(hexChars, 16).value();
if (!codePoint || (0xD800 <= codePoint && codePoint <= 0xDFFF) || codePoint > 0x10FFFF)
if (!codePoint || U_IS_SURROGATE(codePoint) || codePoint > 0x10FFFF)
return replacementCharacter;
return codePoint;
}
Expand Down
22 changes: 11 additions & 11 deletions Source/WebCore/dom/TextEncoderStreamEncoder.cpp
Expand Up @@ -28,6 +28,7 @@
#include <JavaScriptCore/GenericTypedArrayViewInlines.h>
#include <JavaScriptCore/JSCInlines.h>
#include <JavaScriptCore/JSGenericTypedArrayViewInlines.h>
#include <wtf/unicode/CharacterNames.h>

namespace WebCore {

Expand All @@ -44,22 +45,21 @@ RefPtr<Uint8Array> TextEncoderStreamEncoder::encode(const String& input)
for (size_t cptr = 0; cptr < view.length(); cptr++) {
// https://encoding.spec.whatwg.org/#convert-code-unit-to-scalar-value
auto token = view[cptr];
if (m_pendingHighSurrogate) {
auto highSurrogate = *m_pendingHighSurrogate;
m_pendingHighSurrogate = std::nullopt;
if (token >= 0xDC00 && token <= 0xDFFF) {
auto codePoint = 0x10000 + ((highSurrogate - 0xD800) << 10) + (token - 0xDC00);
if (m_pendingLeadSurrogate) {
auto leadSurrogate = *std::exchange(m_pendingLeadSurrogate, std::nullopt);
if (U16_IS_TRAIL(token)) {
auto codePoint = U16_GET_SUPPLEMENTARY(leadSurrogate, token);
U8_APPEND_UNSAFE(bytes.data(), bytesWritten, codePoint);
continue;
}
U8_APPEND_UNSAFE(bytes.data(), bytesWritten, 0XFFFD);
U8_APPEND_UNSAFE(bytes.data(), bytesWritten, replacementCharacter);
}
if (token >= 0xD800 && token <= 0xDBFF) {
m_pendingHighSurrogate = token;
if (U16_IS_LEAD(token)) {
m_pendingLeadSurrogate = token;
continue;
}
if (token >= 0xDC00 && token <= 0xDFFF) {
U8_APPEND_UNSAFE(bytes.data(), bytesWritten, 0XFFFD);
if (U16_IS_TRAIL(token)) {
U8_APPEND_UNSAFE(bytes.data(), bytesWritten, replacementCharacter);
continue;
}
U8_APPEND_UNSAFE(bytes.data(), bytesWritten, token);
Expand All @@ -74,7 +74,7 @@ RefPtr<Uint8Array> TextEncoderStreamEncoder::encode(const String& input)

RefPtr<Uint8Array> TextEncoderStreamEncoder::flush()
{
if (!m_pendingHighSurrogate)
if (!m_pendingLeadSurrogate)
return nullptr;

constexpr uint8_t byteSequence[] = { 0xEF, 0xBF, 0xBD };
Expand Down
2 changes: 1 addition & 1 deletion Source/WebCore/dom/TextEncoderStreamEncoder.h
Expand Up @@ -42,7 +42,7 @@ class TextEncoderStreamEncoder : public RefCounted<TextEncoderStreamEncoder> {
private:
TextEncoderStreamEncoder() = default;

std::optional<UChar> m_pendingHighSurrogate;
std::optional<UChar> m_pendingLeadSurrogate;
};

}

0 comments on commit b58c8be

Please sign in to comment.