Skip to content
Permalink
Browse files
[JSC] Implement String#isWellFormed and String#toWellFormed
https://bugs.webkit.org/show_bug.cgi?id=248588
rdar://problem/102849210

Reviewed by Ross Kirsling.

This patch implements the String#isWellFormed and String#toWellFormed proposal [1].
String#isWellFormed returns true if the string contains no lone (unpaired) surrogate code units.
String#toWellFormed returns a string in which every lone surrogate code unit is replaced with the Unicode replacement character (\uFFFD).

[1]: https://github.com/tc39/proposal-is-usv-string

* JSTests/stress/string-well-formed.js: Added.
(shouldBe):
* LayoutTests/js/Object-getOwnPropertyNames-expected.txt:
* LayoutTests/js/script-tests/Object-getOwnPropertyNames.js:
* Source/JavaScriptCore/runtime/CommonIdentifiers.h:
* Source/JavaScriptCore/runtime/OptionsList.h:
* Source/JavaScriptCore/runtime/StringPrototype.cpp:
(JSC::StringPrototype::finishCreation):
(JSC::illFormedIndex):
(JSC::JSC_DEFINE_HOST_FUNCTION):

Canonical link: https://commits.webkit.org/257250@main
  • Loading branch information
Constellation committed Dec 1, 2022
1 parent 2c29849 commit 6ccd7405a513dab05d5a59359330af76e177f968
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 2 deletions.
@@ -0,0 +1,32 @@
//@ requireOptions("--useStringWellFormed=1")
// Strict-equality assertion helper for this test: does nothing when `actual`
// is identical (===) to `expected`, otherwise throws an Error whose message
// carries the actual value.
function shouldBe(actual, expected) {
    if (actual === expected)
        return;
    throw new Error('bad value: ' + actual);
}

// Each entry is [input, expected isWellFormed() result, expected toWellFormed() result].
// Covers: empty, Latin-1, BMP-only, a valid surrogate pair, a lone lead
// surrogate (at the end and in the middle), a lone trail surrogate, and a
// trail surrogate followed by a lead surrogate (reversed pair).
const wellFormedCases = [
    ["", true, ""],
    ["Hello World", true, "Hello World"],
    ["こんにちわ", true, "こんにちわ"],
    ["𠮷野家", true, "𠮷野家"],
    ["A\uD842", false, "A\uFFFD"],
    ["A\uD842A", false, "A\uFFFDA"],
    ["A\uD842\uDFB7", true, "A\uD842\uDFB7"],
    ["A\uDFB7", false, "A\uFFFD"],
    ["A\uDFB7\uD842", false, "A\uFFFD\uFFFD"],
];

for (const [input, expectedIsWellFormed, expectedWellFormed] of wellFormedCases) {
    shouldBe(input.isWellFormed(), expectedIsWellFormed);
    shouldBe(input.toWellFormed(), expectedWellFormed);
}
@@ -49,7 +49,7 @@ PASS getSortedOwnPropertyNames(Function.prototype) is ['apply', 'arguments', 'bi
PASS getSortedOwnPropertyNames(Array) is ['from', 'fromAsync', 'isArray', 'length', 'name', 'of', 'prototype']
PASS getSortedOwnPropertyNames(Array.prototype) is ['at', 'concat', 'constructor', 'copyWithin', 'entries', 'every', 'fill', 'filter', 'find', 'findIndex', 'findLast', 'findLastIndex', 'flat', 'flatMap', 'forEach', 'group', 'groupToMap', 'includes', 'indexOf', 'join', 'keys', 'lastIndexOf', 'length', 'map', 'pop', 'push', 'reduce', 'reduceRight', 'reverse', 'shift', 'slice', 'some', 'sort', 'splice', 'toLocaleString', 'toReversed', 'toSorted', 'toSpliced', 'toString', 'unshift', 'values', 'with']
PASS getSortedOwnPropertyNames(String) is ['fromCharCode', 'fromCodePoint', 'length', 'name', 'prototype', 'raw']
PASS getSortedOwnPropertyNames(String.prototype) is ['anchor', 'at', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'codePointAt', 'concat', 'constructor', 'endsWith', 'fixed', 'fontcolor', 'fontsize', 'includes', 'indexOf', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'matchAll', 'normalize', 'padEnd', 'padStart', 'repeat', 'replace', 'replaceAll', 'search', 'slice', 'small', 'split', 'startsWith', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'trim', 'trimEnd', 'trimLeft', 'trimRight', 'trimStart', 'valueOf']
PASS getSortedOwnPropertyNames(String.prototype) is ['anchor', 'at', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'codePointAt', 'concat', 'constructor', 'endsWith', 'fixed', 'fontcolor', 'fontsize', 'includes', 'indexOf', 'isWellFormed', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'matchAll', 'normalize', 'padEnd', 'padStart', 'repeat', 'replace', 'replaceAll', 'search', 'slice', 'small', 'split', 'startsWith', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'toWellFormed', 'trim', 'trimEnd', 'trimLeft', 'trimRight', 'trimStart', 'valueOf']
PASS getSortedOwnPropertyNames(Boolean) is ['length', 'name', 'prototype']
PASS getSortedOwnPropertyNames(Boolean.prototype) is ['constructor', 'toString', 'valueOf']
PASS getSortedOwnPropertyNames(Number) is ['EPSILON', 'MAX_SAFE_INTEGER', 'MAX_VALUE', 'MIN_SAFE_INTEGER', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'NaN', 'POSITIVE_INFINITY', 'isFinite', 'isInteger', 'isNaN', 'isSafeInteger', 'length', 'name', 'parseFloat', 'parseInt', 'prototype']
@@ -58,7 +58,7 @@ var expectedPropertyNamesSet = {
"Array": "['from', 'fromAsync', 'isArray', 'length', 'name', 'of', 'prototype']",
"Array.prototype": "['at', 'concat', 'constructor', 'copyWithin', 'entries', 'every', 'fill', 'filter', 'find', 'findIndex', 'findLast', 'findLastIndex', 'flat', 'flatMap', 'forEach', 'group', 'groupToMap', 'includes', 'indexOf', 'join', 'keys', 'lastIndexOf', 'length', 'map', 'pop', 'push', 'reduce', 'reduceRight', 'reverse', 'shift', 'slice', 'some', 'sort', 'splice', 'toLocaleString', 'toReversed', 'toSorted', 'toSpliced', 'toString', 'unshift', 'values', 'with']",
"String": "['fromCharCode', 'fromCodePoint', 'length', 'name', 'prototype', 'raw']",
"String.prototype": "['anchor', 'at', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'codePointAt', 'concat', 'constructor', 'endsWith', 'fixed', 'fontcolor', 'fontsize', 'includes', 'indexOf', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'matchAll', 'normalize', 'padEnd', 'padStart', 'repeat', 'replace', 'replaceAll', 'search', 'slice', 'small', 'split', 'startsWith', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'trim', 'trimEnd', 'trimLeft', 'trimRight', 'trimStart', 'valueOf']",
"String.prototype": "['anchor', 'at', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'codePointAt', 'concat', 'constructor', 'endsWith', 'fixed', 'fontcolor', 'fontsize', 'includes', 'indexOf', 'isWellFormed', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'matchAll', 'normalize', 'padEnd', 'padStart', 'repeat', 'replace', 'replaceAll', 'search', 'slice', 'small', 'split', 'startsWith', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'toWellFormed', 'trim', 'trimEnd', 'trimLeft', 'trimRight', 'trimStart', 'valueOf']",
"Boolean": "['length', 'name', 'prototype']",
"Boolean.prototype": "['constructor', 'toString', 'valueOf']",
"Number": "['EPSILON', 'MAX_SAFE_INTEGER', 'MAX_VALUE', 'MIN_SAFE_INTEGER', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'NaN', 'POSITIVE_INFINITY', 'isFinite', 'isInteger', 'isNaN', 'isSafeInteger', 'length', 'name', 'parseFloat', 'parseInt', 'prototype']",
@@ -171,6 +171,7 @@
macro(isPrototypeOf) \
macro(isView) \
macro(isWatchpoint) \
macro(isWellFormed) \
macro(isWordLike) \
macro(jettisonReason) \
macro(join) \
@@ -270,6 +271,7 @@
macro(toPrecision) \
macro(toString) \
macro(toTemporalInstant) \
macro(toWellFormed) \
macro(trailingZeroDisplay) \
macro(transfer) \
macro(type) \
@@ -553,6 +553,7 @@ bool canUseWebAssemblyFastMemory();
v(Bool, useResizableArrayBuffer, true, Normal, "Expose ResizableArrayBuffer feature.") \
v(Bool, useSharedArrayBuffer, false, Normal, nullptr) \
v(Bool, useShadowRealm, false, Normal, "Expose the ShadowRealm object.") \
v(Bool, useStringWellFormed, true, Normal, "Expose the String well-formed methods.") \
v(Bool, useTemporal, false, Normal, "Expose the Temporal object.") \
v(Bool, useWebAssemblyThreading, true, Normal, "Allow instructions from the wasm threading spec.") \
v(Bool, useWebAssemblyTypedFunctionReferences, false, Normal, "Allow function types from the wasm typed function references spec.") \
@@ -77,6 +77,8 @@ static JSC_DECLARE_HOST_FUNCTION(stringProtoFuncEndsWith);
static JSC_DECLARE_HOST_FUNCTION(stringProtoFuncIncludes);
static JSC_DECLARE_HOST_FUNCTION(stringProtoFuncNormalize);
static JSC_DECLARE_HOST_FUNCTION(stringProtoFuncIterator);
static JSC_DECLARE_HOST_FUNCTION(stringProtoFuncIsWellFormed);
static JSC_DECLARE_HOST_FUNCTION(stringProtoFuncToWellFormed);

}

@@ -164,6 +166,11 @@ void StringPrototype::finishCreation(VM& vm, JSGlobalObject* globalObject)
JSC_NATIVE_FUNCTION_WITHOUT_TRANSITION(vm.propertyNames->builtinNames().substrPrivateName(), stringProtoFuncSubstr, static_cast<unsigned>(PropertyAttribute::DontEnum), 2, ImplementationVisibility::Public);
JSC_NATIVE_FUNCTION_WITHOUT_TRANSITION(vm.propertyNames->builtinNames().endsWithPrivateName(), stringProtoFuncEndsWith, static_cast<unsigned>(PropertyAttribute::DontEnum), 2, ImplementationVisibility::Public);

if (Options::useStringWellFormed()) {
JSC_NATIVE_FUNCTION_WITHOUT_TRANSITION(vm.propertyNames->isWellFormed, stringProtoFuncIsWellFormed, static_cast<unsigned>(PropertyAttribute::DontEnum), 0, ImplementationVisibility::Public);
JSC_NATIVE_FUNCTION_WITHOUT_TRANSITION(vm.propertyNames->toWellFormed, stringProtoFuncToWellFormed, static_cast<unsigned>(PropertyAttribute::DontEnum), 0, ImplementationVisibility::Public);
}

// The constructor will be added later, after StringConstructor has been built
}

@@ -1757,4 +1764,117 @@ JSC_DEFINE_HOST_FUNCTION(stringProtoFuncNormalize, (JSGlobalObject* globalObject
RELEASE_AND_RETURN(scope, JSValue::encode(normalize(globalObject, string, form)));
}

static inline std::optional<unsigned> illFormedIndex(const UChar* characters, unsigned length)
{
for (unsigned index = 0; index < length; ++index) {
UChar character = characters[index];
if (!U16_IS_SURROGATE(character))
continue;

if (U16_IS_SURROGATE_TRAIL(character))
return index;

ASSERT(U16_IS_SURROGATE_LEAD(character));
if ((index + 1) == length)
return index;
UChar nextCharacter = characters[index + 1];

if (!U16_IS_SURROGATE(nextCharacter))
return index;

if (!U16_IS_SURROGATE_TRAIL(nextCharacter))
return index;

++index; // Increment additionally.
}
return std::nullopt;
}

// Host function backing String.prototype.isWellFormed.
// Throws a TypeError when |this| fails checkObjectCoercible; otherwise converts
// |this| to a string and returns true iff illFormedIndex() finds no ill-formed
// code unit in it.
JSC_DEFINE_HOST_FUNCTION(stringProtoFuncIsWellFormed, (JSGlobalObject* globalObject, CallFrame* callFrame))
{
    VM& vm = globalObject->vm();
    auto scope = DECLARE_THROW_SCOPE(vm);

    JSValue receiver = callFrame->thisValue();
    if (!checkObjectCoercible(receiver))
        return throwVMTypeError(globalObject, scope);

    // Latin-1 (8-bit) strings cannot contain surrogates, so no scan is needed.
    bool receiverIs8BitString = receiver.isString() && asString(receiver)->is8Bit();
    if (receiverIs8BitString)
        return JSValue::encode(jsBoolean(true));

    String converted = receiver.toWTFString(globalObject);
    RETURN_IF_EXCEPTION(scope, { });

    // Short-circuit keeps characters16() from being called on an 8-bit string.
    bool wellFormed = converted.is8Bit() || !illFormedIndex(converted.characters16(), converted.length());
    return JSValue::encode(jsBoolean(wellFormed));
}

// Host function backing String.prototype.toWellFormed.
// Throws a TypeError when |this| fails checkObjectCoercible. Otherwise converts
// |this| to a string and returns that string with every ill-formed code unit
// (lone lead or lone trail surrogate) replaced by replacementCharacter.
//
// The result is always a string: when |this| is not already a string primitive
// (e.g. a number or an object) and nothing needs replacing, the converted
// string is returned rather than the original receiver.
JSC_DEFINE_HOST_FUNCTION(stringProtoFuncToWellFormed, (JSGlobalObject* globalObject, CallFrame* callFrame))
{
    VM& vm = globalObject->vm();
    auto scope = DECLARE_THROW_SCOPE(vm);

    JSValue thisValue = callFrame->thisValue();
    if (!checkObjectCoercible(thisValue))
        return throwVMTypeError(globalObject, scope);

    // Latin-1 characters do not have surrogates, so an 8-bit string receiver is already the answer.
    if (thisValue.isString() && asString(thisValue)->is8Bit())
        return JSValue::encode(thisValue);

    String string = thisValue.toWTFString(globalObject);
    RETURN_IF_EXCEPTION(scope, { });

    // Result for the "nothing to replace" paths. Reuse the receiver only when it
    // is itself a string; for any other receiver the result must be the string
    // produced by the conversion above, not the receiver.
    auto encodeUnchangedString = [&] {
        if (thisValue.isString())
            return JSValue::encode(thisValue);
        return JSValue::encode(jsString(vm, string));
    };

    if (string.is8Bit())
        return encodeUnchangedString();

    const UChar* characters = string.characters16();
    unsigned length = string.length();
    auto firstIllFormedIndex = illFormedIndex(characters, length);
    if (!firstIllFormedIndex)
        return encodeUnchangedString();

    // Everything before the first ill-formed code unit is copied verbatim; the
    // remainder is re-validated unit by unit. The output has exactly |length|
    // code units because each replacement is one-for-one.
    Vector<UChar> buffer;
    buffer.reserveInitialCapacity(length);
    buffer.append(characters, firstIllFormedIndex.value());
    for (unsigned index = firstIllFormedIndex.value(); index < length; ++index) {
        UChar character = characters[index];

        if (!U16_IS_SURROGATE(character)) {
            buffer.append(character);
            continue;
        }

        // A surrogate is kept only when it is a lead immediately followed by a trail.
        bool startsValidPair = U16_IS_SURROGATE_LEAD(character)
            && (index + 1) < length
            && U16_IS_SURROGATE(characters[index + 1])
            && U16_IS_SURROGATE_TRAIL(characters[index + 1]);
        if (!startsValidPair) {
            buffer.append(replacementCharacter);
            continue;
        }

        buffer.append(character);
        buffer.append(characters[index + 1]);
        ++index; // Skip the trail surrogate that was just copied.
    }

    return JSValue::encode(jsString(vm, String::adopt(WTFMove(buffer))));
}

} // namespace JSC

0 comments on commit 6ccd740

Please sign in to comment.