Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
[JSC] Improve String#split for uglify-js-wtb
https://bugs.webkit.org/show_bug.cgi?id=251823
rdar://105104781

Reviewed by Michael Saboff.

This patch improves JetStream2/uglify-js-wtb by 1% with String#split optimizations.

1. We should just use an ArrayWithContiguous array for the result of String#split, with an initial capacity of at least `1`.
   At that point, we will always return an array with a capacity of at least one. So, let's avoid the structure transition
   and the butterfly reallocation.
2. We should search for BoyerMoore lookahead character patterns even when there is a greedy one-character pattern. For example,
   /\r?\n/ is very frequently used. We should search for a [\r\n] character to start matching in this case.
   However, if we have /\r?\n/ pattern,
       1. first character can be [\r\n]
       2. second character can be null, or [\n]
   This confuses fixed-size BoyerMoore lookahead generation. So, when we encounter a greedy one-character pattern, we cut
   the BM prefix length at that point. In the above case, we only consider the first-character [\r\n] case.
   This is still beneficial since previously we gave up completely when we encountered a greedy fixed-size one-character pattern.

                                   ToT                     Patched

    string-split-space      130.3261+-0.7798     ^    125.6028+-0.4402        ^ definitely 1.0376x faster
    string-split            206.8225+-1.1932     ^    128.5716+-0.7137        ^ definitely 1.6086x faster

* JSTests/microbenchmarks/string-split-space.js: Added.
(split):
* JSTests/microbenchmarks/string-split.js: Added.
(split):
* Source/JavaScriptCore/runtime/RegExpPrototype.cpp:
(JSC::JSC_DEFINE_HOST_FUNCTION):
* Source/JavaScriptCore/yarr/YarrJIT.cpp:

Canonical link: https://commits.webkit.org/259941@main
  • Loading branch information
Constellation committed Feb 7, 2023
1 parent 35591d2 commit 8b7e7ed
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 18 deletions.
11 changes: 11 additions & 0 deletions JSTests/microbenchmarks/string-split-space.js
@@ -0,0 +1,11 @@
// Benchmark input: a Lorem ipsum paragraph written on a single line, so its words are separated by spaces.
var string = `Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.`;

// Thin wrapper around String.prototype.split so the call under test lives in its own function.
// Returns whatever string.split(regexp) returns.
function split(string, regexp)
{
return string.split(regexp);
}
// NOTE(review): noInline is not defined in this file; presumably a test-shell hook that keeps
// split() from being inlined into the driver loop — confirm.
noInline(split);

// Separator under test: a single literal space character.
var regexp = / /;
// Driver loop: perform the same split 1e5 times with the same input string and regexp.
for (var i = 0; i < 1e5; ++i)
split(string, regexp);
79 changes: 79 additions & 0 deletions JSTests/microbenchmarks/string-split.js
@@ -0,0 +1,79 @@
// Benchmark input: a Lorem ipsum paragraph with one word per line, so the template literal
// contains a newline between consecutive words.
var string = `Lorem
ipsum
dolor
sit
amet,
consectetur
adipiscing
elit,
sed
do
eiusmod
tempor
incididunt
ut
labore
et
dolore
magna
aliqua.
Ut
enim
ad
minim
veniam,
quis
nostrud
exercitation
ullamco
laboris
nisi
ut
aliquip
ex
ea
commodo
consequat.
Duis
aute
irure
dolor
in
reprehenderit
in
voluptate
velit
esse
cillum
dolore
eu
fugiat
nulla
pariatur.
Excepteur
sint
occaecat
cupidatat
non
proident,
sunt
in
culpa
qui
officia
deserunt
mollit
anim
id
est
laborum.`;

// Thin wrapper around String.prototype.split so the call under test lives in its own function.
// Returns whatever string.split(regexp) returns.
function split(string, regexp)
{
return string.split(regexp);
}
// NOTE(review): noInline is not defined in this file; presumably a test-shell hook that keeps
// split() from being inlined into the driver loop — confirm.
noInline(split);

// Separator under test: an optional carriage return followed by a newline.
var regexp = /\r?\n/;
// Driver loop: perform the same split 1e5 times with the same input string and regexp.
for (var i = 0; i < 1e5; ++i)
split(string, regexp);
27 changes: 16 additions & 11 deletions Source/JavaScriptCore/runtime/RegExpPrototype.cpp
Expand Up @@ -492,7 +492,7 @@ JSC_DEFINE_HOST_FUNCTION(regExpProtoFuncSplitFast, (JSGlobalObject* globalObject
// 3. [handled by JS builtin] Let S be ? ToString(string).
JSString* inputString = callFrame->argument(0).toString(globalObject);
String input = inputString->value(globalObject);
RETURN_IF_EXCEPTION(scope, encodedJSValue());
RETURN_IF_EXCEPTION(scope, { });
ASSERT(!input.isNull());

// 4. [handled by JS builtin] Let C be ? SpeciesConstructor(rx, %RegExp%).
Expand All @@ -505,14 +505,12 @@ JSC_DEFINE_HOST_FUNCTION(regExpProtoFuncSplitFast, (JSGlobalObject* globalObject

// 11. Let A be ArrayCreate(0).
// 12. Let lengthA be 0.
JSArray* result = constructEmptyArray(globalObject, nullptr);
RETURN_IF_EXCEPTION(scope, encodedJSValue());
unsigned resultLength = 0;

// 13. If limit is undefined, let lim be 2^32-1; else let lim be ? ToUint32(limit).
JSValue limitValue = callFrame->argument(1);
unsigned limit = limitValue.isUndefined() ? 0xFFFFFFFFu : limitValue.toUInt32(globalObject);
RETURN_IF_EXCEPTION(scope, encodedJSValue());
RETURN_IF_EXCEPTION(scope, { });

// 14. Let size be the number of elements in S.
unsigned inputSize = input.length();
Expand All @@ -522,19 +520,21 @@ JSC_DEFINE_HOST_FUNCTION(regExpProtoFuncSplitFast, (JSGlobalObject* globalObject

// 16. If lim == 0, return A.
if (!limit)
return JSValue::encode(result);
RELEASE_AND_RETURN(scope, JSValue::encode(constructEmptyArray(globalObject, nullptr)));

// 17. If size == 0, then
if (input.isEmpty()) {
// a. Let z be ? RegExpExec(splitter, S).
// b. If z is not null, return A.
// c. Perform ! CreateDataProperty(A, "0", S).
// d. Return A.
JSArray* result = constructEmptyArray(globalObject, nullptr);
RETURN_IF_EXCEPTION(scope, { });
auto matchResult = regexp->match(globalObject, input, 0);
RETURN_IF_EXCEPTION(scope, encodedJSValue());
RETURN_IF_EXCEPTION(scope, { });
if (!matchResult) {
result->putDirectIndex(globalObject, 0, inputString);
RETURN_IF_EXCEPTION(scope, encodedJSValue());
RETURN_IF_EXCEPTION(scope, { });
}
return JSValue::encode(result);
}
Expand All @@ -546,6 +546,11 @@ JSC_DEFINE_HOST_FUNCTION(regExpProtoFuncSplitFast, (JSGlobalObject* globalObject
bool regExpIsUnicode = regexp->unicode();

unsigned maxSizeForDirectPath = 100000;
JSArray* result = JSArray::tryCreate(vm, globalObject->arrayStructureForIndexingTypeDuringAllocation(ArrayWithContiguous), 1);
if (UNLIKELY(!result)) {
throwOutOfMemoryError(globalObject, scope);
return { };
}

genericSplit(
globalObject, regexp, input, inputSize, position, matchPosition, regExpIsSticky, regExpIsUnicode,
Expand All @@ -561,7 +566,7 @@ JSC_DEFINE_HOST_FUNCTION(regExpProtoFuncSplitFast, (JSGlobalObject* globalObject
return AbortSplit;
return ContinueSplit;
});
RETURN_IF_EXCEPTION(scope, encodedJSValue());
RETURN_IF_EXCEPTION(scope, { });

if (resultLength >= limit)
return JSValue::encode(result);
Expand Down Expand Up @@ -592,11 +597,11 @@ JSC_DEFINE_HOST_FUNCTION(regExpProtoFuncSplitFast, (JSGlobalObject* globalObject
return AbortSplit;
return ContinueSplit;
});
RETURN_IF_EXCEPTION(scope, encodedJSValue());
RETURN_IF_EXCEPTION(scope, { });

if (resultLength + dryRunCount > MAX_STORAGE_VECTOR_LENGTH) {
throwOutOfMemoryError(globalObject, scope);
return encodedJSValue();
return { };
}

// OK, we know that if we finish the split, we won't have to OOM.
Expand All @@ -615,7 +620,7 @@ JSC_DEFINE_HOST_FUNCTION(regExpProtoFuncSplitFast, (JSGlobalObject* globalObject
return AbortSplit;
return ContinueSplit;
});
RETURN_IF_EXCEPTION(scope, encodedJSValue());
RETURN_IF_EXCEPTION(scope, { });

if (resultLength >= limit)
return JSValue::encode(result);
Expand Down
35 changes: 28 additions & 7 deletions Source/JavaScriptCore/yarr/YarrJIT.cpp
Expand Up @@ -3890,14 +3890,21 @@ class YarrGenerator final : public YarrJITInfo {
case PatternTerm::Type::DotStarEnclosure:
break;
case PatternTerm::Type::CharacterClass: {
if (term.quantityType != QuantifierType::FixedCount || term.quantityMaxCount != 1)
if (term.quantityType != QuantifierType::FixedCount && term.quantityType != QuantifierType::Greedy)
break;
if (term.inputPosition != index)
if (term.quantityMaxCount != 1)
break;
if (term.inputPosition != cursor)
break;
auto& characterClass = *term.characterClass;
if (term.invert() || characterClass.m_anyCharacter) {
bmInfo.setAll(cursor);
++cursor;
// If this is greedy one-character pattern "a?", we should not increase cursor.
// If we see greedy pattern, then we cut bmInfo here to avoid possibility explosion.
if (term.quantityType == QuantifierType::FixedCount)
++cursor;
else
bmInfo.shortenLength(cursor + 1);
continue;
}
if (!characterClass.m_rangesUnicode.isEmpty())
Expand All @@ -3908,13 +3915,21 @@ class YarrGenerator final : public YarrJITInfo {
bmInfo.addRanges(cursor, characterClass.m_ranges);
if (!characterClass.m_matches.isEmpty())
bmInfo.addCharacters(cursor, characterClass.m_matches);
++cursor;

// If this is greedy one-character pattern "a?", we should not increase cursor.
// If we see greedy pattern, then we cut bmInfo here to avoid possibility explosion.
if (term.quantityType == QuantifierType::FixedCount)
++cursor;
else
bmInfo.shortenLength(cursor + 1);
continue;
}
case PatternTerm::Type::PatternCharacter: {
if (term.quantityType != QuantifierType::FixedCount || term.quantityMaxCount != 1)
if (term.quantityType != QuantifierType::FixedCount && term.quantityType != QuantifierType::Greedy)
break;
if (term.quantityMaxCount != 1)
break;
if (term.inputPosition != index)
if (term.inputPosition != cursor)
break;
if (U16_LENGTH(term.patternCharacter) != 1 && m_decodeSurrogatePairs)
break;
Expand All @@ -3926,7 +3941,13 @@ class YarrGenerator final : public YarrJITInfo {
bmInfo.set(cursor, toASCIILower(term.patternCharacter));
} else
bmInfo.set(cursor, term.patternCharacter);
++cursor;

// If this is greedy one-character pattern "a?", we should not increase cursor.
// If we see greedy pattern, then we cut bmInfo here to avoid possibility explosion.
if (term.quantityType == QuantifierType::FixedCount)
++cursor;
else
bmInfo.shortenLength(cursor + 1);
continue;
}
}
Expand Down

0 comments on commit 8b7e7ed

Please sign in to comment.