Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
[JSC] Introduce SubjectSampler to heuristically pick BM search in RegExp
https://bugs.webkit.org/show_bug.cgi?id=252065
rdar://105284820

Reviewed by Mark Lam.

BoyerMoore search's effectiveness depends on whether we can pick a good anchor which rarely appears on the actual text.
And if we pick a character which appears super frequently in text, then it does not have much effectiveness or rather
slows down RegExp performance since BM search adds additional searching code.

So in this patch, we integrate an idea of V8 Irregexp, which samples 128 characters of a text at compile time and use
character frequency as a weight to pick better BoyerMoore search character. Our weight calculation is simpler one than V8,
and it is effective in our benchmarks.

This patch improves JetStream2/regex-dna-SP by 5-10%.

* Source/JavaScriptCore/runtime/RegExp.cpp:
(JSC::RegExp::compile):
(JSC::RegExp::compileMatchOnly):
* Source/JavaScriptCore/runtime/RegExp.h:
* Source/JavaScriptCore/runtime/RegExpInlines.h:
(JSC::RegExp::compileIfNecessary):
(JSC::RegExp::matchInline):
(JSC::RegExp::compileIfNecessaryMatchOnly):
* Source/JavaScriptCore/yarr/YarrJIT.cpp:
(JSC::Yarr::SubjectSampler::SubjectSampler):
(JSC::Yarr::SubjectSampler::frequency const):
(JSC::Yarr::SubjectSampler::sample):
(JSC::Yarr::SubjectSampler::dump const):
(JSC::Yarr::SubjectSampler::is8Bit const):
(JSC::Yarr::SubjectSampler::add):
(JSC::Yarr::BoyerMooreInfo::findBestCharacterSequence const):
(JSC::Yarr::BoyerMooreInfo::findWorthwhileCharacterSequenceForLookahead const):
(JSC::Yarr::jitCompile):
* Source/JavaScriptCore/yarr/YarrJIT.h:

Canonical link: https://commits.webkit.org/260142@main
  • Loading branch information
Constellation committed Feb 11, 2023
1 parent da77e20 commit d43e2a2
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 35 deletions.
10 changes: 6 additions & 4 deletions Source/JavaScriptCore/runtime/RegExp.cpp
Expand Up @@ -227,7 +227,7 @@ void RegExp::byteCodeCompileIfNecessary(VM* vm)
}
}

void RegExp::compile(VM* vm, Yarr::CharSize charSize)
void RegExp::compile(VM* vm, Yarr::CharSize charSize, std::optional<StringView> sampleString)
{
Locker locker { cellLock() };

Expand All @@ -252,14 +252,15 @@ void RegExp::compile(VM* vm, Yarr::CharSize charSize)
&& !pattern.m_containsLookbehinds
) {
auto& jitCode = ensureRegExpJITCode();
Yarr::jitCompile(pattern, m_patternString, charSize, vm, jitCode, Yarr::JITCompileMode::IncludeSubpatterns);
Yarr::jitCompile(pattern, m_patternString, charSize, sampleString, vm, jitCode, Yarr::JITCompileMode::IncludeSubpatterns);
if (!jitCode.failureReason()) {
m_state = JITCode;
return;
}
}
#else
UNUSED_PARAM(charSize);
UNUSED_PARAM(sampleString);
#endif

dataLogLnIf(Options::dumpCompiledRegExpPatterns(), "Can't JIT this regular expression: \"/", m_patternString, "/\"");
Expand Down Expand Up @@ -291,7 +292,7 @@ bool RegExp::matchConcurrently(
return true;
}

void RegExp::compileMatchOnly(VM* vm, Yarr::CharSize charSize)
void RegExp::compileMatchOnly(VM* vm, Yarr::CharSize charSize, std::optional<StringView> sampleString)
{
Locker locker { cellLock() };

Expand All @@ -316,14 +317,15 @@ void RegExp::compileMatchOnly(VM* vm, Yarr::CharSize charSize)
&& !pattern.m_containsLookbehinds
) {
auto& jitCode = ensureRegExpJITCode();
Yarr::jitCompile(pattern, m_patternString, charSize, vm, jitCode, Yarr::JITCompileMode::MatchOnly);
Yarr::jitCompile(pattern, m_patternString, charSize, sampleString, vm, jitCode, Yarr::JITCompileMode::MatchOnly);
if (!jitCode.failureReason()) {
m_state = JITCode;
return;
}
}
#else
UNUSED_PARAM(charSize);
UNUSED_PARAM(sampleString);
#endif

dataLogLnIf(Options::dumpCompiledRegExpPatterns(), "Can't JIT this regular expression: \"/", m_patternString, "/\"");
Expand Down
8 changes: 4 additions & 4 deletions Source/JavaScriptCore/runtime/RegExp.h
Expand Up @@ -164,11 +164,11 @@ class RegExp final : public JSCell {

void byteCodeCompileIfNecessary(VM*);

void compile(VM*, Yarr::CharSize);
void compileIfNecessary(VM&, Yarr::CharSize);
void compile(VM*, Yarr::CharSize, std::optional<StringView> sampleString);
void compileIfNecessary(VM&, Yarr::CharSize, std::optional<StringView> sampleString);

void compileMatchOnly(VM*, Yarr::CharSize);
void compileIfNecessaryMatchOnly(VM&, Yarr::CharSize);
void compileMatchOnly(VM*, Yarr::CharSize, std::optional<StringView> sampleString);
void compileIfNecessaryMatchOnly(VM&, Yarr::CharSize, std::optional<StringView> sampleString);

#if ENABLE(YARR_JIT_DEBUG)
void matchCompareWithInterpreter(const String&, int startOffset, int* offsetVector, int jitResult);
Expand Down
12 changes: 6 additions & 6 deletions Source/JavaScriptCore/runtime/RegExpInlines.h
Expand Up @@ -87,15 +87,15 @@ ALWAYS_INLINE bool RegExp::hasCodeFor(Yarr::CharSize charSize)
return false;
}

ALWAYS_INLINE void RegExp::compileIfNecessary(VM& vm, Yarr::CharSize charSize)
ALWAYS_INLINE void RegExp::compileIfNecessary(VM& vm, Yarr::CharSize charSize, std::optional<StringView> sampleString)
{
if (hasCodeFor(charSize))
return;

if (m_state == ParseError)
return;

compile(&vm, charSize);
compile(&vm, charSize, sampleString);
}

template<typename VectorType, Yarr::MatchFrom matchFrom>
Expand All @@ -106,7 +106,7 @@ ALWAYS_INLINE int RegExp::matchInline(JSGlobalObject* nullOrGlobalObject, VM& vm
m_rtMatchTotalSubjectStringLen += (double)(s.length() - startOffset);
#endif

compileIfNecessary(vm, s.is8Bit() ? Yarr::CharSize::Char8 : Yarr::CharSize::Char16);
compileIfNecessary(vm, s.is8Bit() ? Yarr::CharSize::Char8 : Yarr::CharSize::Char16, s);

auto throwError = [&] {
if (matchFrom == Yarr::MatchFrom::CompilerThread)
Expand Down Expand Up @@ -227,15 +227,15 @@ ALWAYS_INLINE bool RegExp::hasMatchOnlyCodeFor(Yarr::CharSize charSize)
return false;
}

ALWAYS_INLINE void RegExp::compileIfNecessaryMatchOnly(VM& vm, Yarr::CharSize charSize)
ALWAYS_INLINE void RegExp::compileIfNecessaryMatchOnly(VM& vm, Yarr::CharSize charSize, std::optional<StringView> sampleString)
{
if (hasMatchOnlyCodeFor(charSize))
return;

if (m_state == ParseError)
return;

compileMatchOnly(&vm, charSize);
compileMatchOnly(&vm, charSize, sampleString);
}

template<Yarr::MatchFrom matchFrom>
Expand All @@ -246,7 +246,7 @@ ALWAYS_INLINE MatchResult RegExp::matchInline(JSGlobalObject* nullOrGlobalObject
m_rtMatchOnlyTotalSubjectStringLen += (double)(s.length() - startOffset);
#endif

compileIfNecessaryMatchOnly(vm, s.is8Bit() ? Yarr::CharSize::Char8 : Yarr::CharSize::Char16);
compileIfNecessaryMatchOnly(vm, s.is8Bit() ? Yarr::CharSize::Char8 : Yarr::CharSize::Char16, s);

auto throwError = [&] {
if (matchFrom == Yarr::MatchFrom::CompilerThread)
Expand Down
104 changes: 84 additions & 20 deletions Source/JavaScriptCore/yarr/YarrJIT.cpp
Expand Up @@ -53,6 +53,60 @@ static constexpr bool verbose = false;
JSC_ANNOTATE_JIT_OPERATION_RETURN(vmEntryToYarrJITAfter);
#endif

// We should pick the less frequently appearing character as a BM search's anchor to make BM search more and more efficient.
// This class takes some samples from the passed subject string to put weight on characters so that we can pick an optimal one adaptively.
class SubjectSampler {
public:
static constexpr unsigned sampleSize = 128;

explicit SubjectSampler(CharSize charSize)
: m_is8Bit(charSize == CharSize::Char8)
{
}

int32_t frequency(UChar character) const
{
if (!m_size)
return 1;
return static_cast<int32_t>(m_samples[character & BoyerMooreBitmap::mapMask]) * sampleSize / m_size;
}

void sample(StringView string)
{
unsigned half = string.length() > sampleSize ? (string.length() - sampleSize) / 2 : 0;
unsigned end = std::min(string.length(), half + sampleSize);
if (string.is8Bit()) {
auto* characters8 = string.characters8();
for (unsigned i = half; i < end; ++i)
add(characters8[i]);
} else {
auto* characters16 = string.characters16();
for (unsigned i = half; i < end; ++i)
add(characters16[i]);
}
}

void dump() const
{
dataLogLn("Sampling Results size:(", m_size, ")");
for (unsigned i = 0; i < BoyerMooreBitmap::mapSize; ++i)
dataLogLn(" [", makeString(pad(' ', 3, i)), "] ", m_samples[i]);
}

bool is8Bit() const { return m_is8Bit; }

private:
inline void add(UChar character)
{
++m_size;
++m_samples[character & BoyerMooreBitmap::mapMask];
}

std::array<uint8_t, BoyerMooreBitmap::mapSize> m_samples { };
uint8_t m_size { };
bool m_is8Bit { true };
};

void BoyerMooreFastCandidates::dump(PrintStream& out) const
{
if (!isValid()) {
Expand Down Expand Up @@ -107,21 +161,21 @@ class BoyerMooreInfo {
return makeUniqueRef<BoyerMooreInfo>(charSize, length);
}

std::optional<std::tuple<unsigned, unsigned>> findWorthwhileCharacterSequenceForLookahead() const;
std::optional<std::tuple<unsigned, unsigned>> findWorthwhileCharacterSequenceForLookahead(const SubjectSampler&) const;
std::tuple<BoyerMooreBitmap::Map, BoyerMooreFastCandidates> createCandidateBitmap(unsigned begin, unsigned end) const;

void dump(PrintStream&) const;

private:
std::tuple<int, unsigned, unsigned> findBestCharacterSequence(unsigned numberOfCandidatesLimit) const;
std::tuple<int32_t, unsigned, unsigned> findBestCharacterSequence(const SubjectSampler&, unsigned numberOfCandidatesLimit) const;

Vector<BoyerMooreBitmap> m_characters;
CharSize m_charSize;
};

std::tuple<int, unsigned, unsigned> BoyerMooreInfo::findBestCharacterSequence(unsigned numberOfCandidatesLimit) const
std::tuple<int32_t, unsigned, unsigned> BoyerMooreInfo::findBestCharacterSequence(const SubjectSampler& sampler, unsigned numberOfCandidatesLimit) const
{
int biggestPoint = 0;
int32_t biggestPoint = INT32_MIN;
unsigned beginResult = 0;
unsigned endResult = 0;
for (unsigned index = 0; index < length();) {
Expand All @@ -134,13 +188,14 @@ std::tuple<int, unsigned, unsigned> BoyerMooreInfo::findBestCharacterSequence(un
for (; index < length() && m_characters[index].count() <= numberOfCandidatesLimit; ++index)
map.merge(m_characters[index].map());

// If map has many candidates, then point of this sequence is low since it will match too many things.
// And if the sequence is longer, then the point of this sequence is higher since it can skip many characters.
// FIXME: Currently we are handling all characters equally. But we should have weight per character since e.g. 'e' should appear more frequently than '\v'.
// https://bugs.webkit.org/show_bug.cgi?id=228610
int frequency = map.count();
int matchingProbability = BoyerMooreBitmap::mapSize - frequency;
int point = (index - begin) * matchingProbability;
int32_t frequency = 0;
map.forEachSetBit([&](unsigned index) {
frequency += sampler.frequency(index);
});

// Cutoff at 50%. If we could encounter the character more than 50%, then BM search would be useless probably.
int32_t matchingProbability = (BoyerMooreBitmap::mapSize / 2) - frequency;
int32_t point = (index - begin) * matchingProbability;
if (point > biggestPoint) {
biggestPoint = point;
beginResult = begin;
Expand All @@ -150,26 +205,26 @@ std::tuple<int, unsigned, unsigned> BoyerMooreInfo::findBestCharacterSequence(un
return std::tuple { biggestPoint, beginResult, endResult };
}

std::optional<std::tuple<unsigned, unsigned>> BoyerMooreInfo::findWorthwhileCharacterSequenceForLookahead() const
std::optional<std::tuple<unsigned, unsigned>> BoyerMooreInfo::findWorthwhileCharacterSequenceForLookahead(const SubjectSampler& sampler) const
{
// If candiates-per-character becomes larger, then sequence is not profitable since this sequence will match against
// too many characters. But if we limit candiates-per-character smaller, it is possible that we only find very short
// character sequence. We start with low limit, then enlarging the limit to find more and more profitable
// character sequence.
int biggestPoint = 0;
int32_t biggestPoint = INT32_MIN;
unsigned begin = 0;
unsigned end = 0;
constexpr unsigned maxCandidatesPerCharacter = 32;
static_assert(maxCandidatesPerCharacter < BoyerMooreBitmap::mapSize);
for (unsigned limit = 4; limit < maxCandidatesPerCharacter; limit *= 2) {
auto [newPoint, newBegin, newEnd] = findBestCharacterSequence(limit);
auto [newPoint, newBegin, newEnd] = findBestCharacterSequence(sampler, limit);
if (newPoint > biggestPoint) {
biggestPoint = newPoint;
begin = newBegin;
end = newEnd;
}
}
if (!biggestPoint)
if (biggestPoint < 0)
return std::nullopt;
return std::tuple { begin, end };
}
Expand Down Expand Up @@ -2355,7 +2410,7 @@ class YarrGenerator final : public YarrJITInfo {

// Emit fast skip path with stride if we have BoyerMooreInfo.
if (op.m_bmInfo) {
auto range = op.m_bmInfo->findWorthwhileCharacterSequenceForLookahead();
auto range = op.m_bmInfo->findWorthwhileCharacterSequenceForLookahead(m_sampler);
if (range) {
auto [beginIndex, endIndex] = *range;
ASSERT(endIndex <= alternative->m_minimumSize);
Expand Down Expand Up @@ -2442,7 +2497,8 @@ class YarrGenerator final : public YarrJITInfo {
} else
setMatchStart(m_regs.index);
}
}
} else
dataLogLnIf(Options::verboseRegExpCompilation(), "BM search candidates were not efficient enough. Not using BM search");
}
break;
}
Expand Down Expand Up @@ -3839,6 +3895,8 @@ class YarrGenerator final : public YarrJITInfo {
m_ops.last().m_bmInfo = bmInfo.ptr();
m_bmInfos.append(WTFMove(bmInfo));
m_usesT2 = true;
if (m_sampleString)
m_sampler.sample(m_sampleString.value());
} else
dataLogLnIf(YarrJITInternal::verbose, "BM collection failed");
}
Expand Down Expand Up @@ -4220,7 +4278,7 @@ class YarrGenerator final : public YarrJITInfo {
}

public:
YarrGenerator(CCallHelpers& jit, const VM* vm, YarrCodeBlock* codeBlock, const YarrJITRegs& regs, YarrPattern& pattern, StringView patternString, CharSize charSize, JITCompileMode compileMode)
YarrGenerator(CCallHelpers& jit, const VM* vm, YarrCodeBlock* codeBlock, const YarrJITRegs& regs, YarrPattern& pattern, StringView patternString, CharSize charSize, JITCompileMode compileMode, std::optional<StringView> sampleString)
: m_jit(jit)
, m_vm(vm)
, m_codeBlock(codeBlock)
Expand All @@ -4236,6 +4294,8 @@ class YarrGenerator final : public YarrJITInfo {
#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS)
, m_parenContextSizes(compileMode == JITCompileMode::IncludeSubpatterns ? m_pattern.m_numSubpatterns : 0, m_pattern.m_body->m_callFrameSize)
#endif
, m_sampleString(sampleString)
, m_sampler(charSize)
{
}

Expand All @@ -4255,6 +4315,7 @@ class YarrGenerator final : public YarrJITInfo {
#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS)
, m_parenContextSizes(compileMode == JITCompileMode::IncludeSubpatterns ? m_pattern.m_numSubpatterns : 0, m_pattern.m_body->m_callFrameSize)
#endif
, m_sampler(charSize)
{
if (m_pattern.m_containsBackreferences)
m_usesT2 = true;
Expand Down Expand Up @@ -4824,6 +4885,9 @@ class YarrGenerator final : public YarrJITInfo {
// offset in the stack if there wasn't enough registers to pass it, e.g.,
// ARMv7 and MIPS only use 4 registers to pass function arguments.
unsigned m_pushCountInEnter { 0 };

std::optional<StringView> m_sampleString;
SubjectSampler m_sampler;
};

static void dumpCompileFailure(JITFailureReason failure)
Expand Down Expand Up @@ -4862,14 +4926,14 @@ static void dumpCompileFailure(JITFailureReason failure)
}
}

void jitCompile(YarrPattern& pattern, StringView patternString, CharSize charSize, VM* vm, YarrCodeBlock& codeBlock, JITCompileMode mode)
void jitCompile(YarrPattern& pattern, StringView patternString, CharSize charSize, std::optional<StringView> sampleString, VM* vm, YarrCodeBlock& codeBlock, JITCompileMode mode)
{
CCallHelpers masm;

ASSERT(mode == JITCompileMode::MatchOnly || mode == JITCompileMode::IncludeSubpatterns);

YarrJITDefaultRegisters jitRegisters;
YarrGenerator<YarrJITDefaultRegisters>(masm, vm, &codeBlock, jitRegisters, pattern, patternString, charSize, mode).compile(codeBlock);
YarrGenerator<YarrJITDefaultRegisters>(masm, vm, &codeBlock, jitRegisters, pattern, patternString, charSize, mode, sampleString).compile(codeBlock);

if (auto failureReason = codeBlock.failureReason()) {
if (UNLIKELY(Options::dumpCompiledRegExpPatterns())) {
Expand Down
2 changes: 1 addition & 1 deletion Source/JavaScriptCore/yarr/YarrJIT.h
Expand Up @@ -437,7 +437,7 @@ enum class JITCompileMode : uint8_t {
IncludeSubpatterns,
InlineTest
};
void jitCompile(YarrPattern&, StringView patternString, CharSize, VM*, YarrCodeBlock& jitObject, JITCompileMode);
void jitCompile(YarrPattern&, StringView patternString, CharSize, std::optional<StringView> sampleString, VM*, YarrCodeBlock& jitObject, JITCompileMode);

#if ENABLE(YARR_JIT_REGEXP_TEST_INLINE)

Expand Down

0 comments on commit d43e2a2

Please sign in to comment.