Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
HTML fast parser fails to parse complex HTML entities
https://bugs.webkit.org/show_bug.cgi?id=255302

Reviewed by Ryosuke Niwa.

When trying to parse a non-trivial HTML entity such as `¢`, the fast HTML
parser would call `consumeHTMLEntity()` with the string "cent". This would
always fail `notEnoughCharacters` would be set to true. This is because our
parser currently requires data after the HTML entity to make sure we reached
the end of the entity.

To address the issue, the HTML fast parser now includes the trailing semicolon
when calling `consumeHTMLEntity()`. We now pass the string "cent;" for example.
I also tweaked the HTMLEntityParser to not fail with `notEnoughCharacters` if
the last character was a semicolon. In this case, it is safe to assume the
entity was complete, even though we don't know what comes next in the stream.

* Source/WebCore/Headers.cmake:
* Source/WebCore/WebCore.xcodeproj/project.pbxproj:
* Source/WebCore/dom/DocumentFragment.h:
* Source/WebCore/html/parser/HTMLDocumentParserFastPath.cpp:
(WebCore::HTMLFastPathParser::scanHTMLCharacterReference):
* Source/WebCore/html/parser/HTMLDocumentParserFastPath.h:
* Source/WebCore/html/parser/HTMLEntityParser.cpp:
(WebCore::HTMLEntityParser::consumeNamedEntity):
* Tools/TestWebKitAPI/Tests/WebCore/HTMLParserIdioms.cpp:
(TestWebKitAPI::TEST):

Canonical link: https://commits.webkit.org/262856@main
  • Loading branch information
cdumez committed Apr 12, 2023
1 parent 1612e71 commit 2e3b48b
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 12 deletions.
3 changes: 2 additions & 1 deletion Source/WebCore/Headers.cmake
Expand Up @@ -1063,9 +1063,10 @@ set(WebCore_PRIVATE_FRAMEWORK_HEADERS

html/forms/FileIconLoader.h

html/parser/ParsingUtilities.h
html/parser/HTMLDocumentParserFastPath.h
html/parser/HTMLParserIdioms.h
html/parser/HTMLParserScriptingFlagPolicy.h
html/parser/ParsingUtilities.h

html/track/AudioTrack.h
html/track/AudioTrackClient.h
Expand Down
2 changes: 1 addition & 1 deletion Source/WebCore/WebCore.xcodeproj/project.pbxproj
Expand Up @@ -1524,7 +1524,7 @@
46CA9C441F97BBE9004CFC3A /* VisibilityState.h in Headers */ = {isa = PBXBuildFile; fileRef = 46CA9C411F97BBE7004CFC3A /* VisibilityState.h */; settings = {ATTRIBUTES = (Private, ); }; };
46CF504A27AB1EA40070DBC8 /* SharedWorkerContextManager.h in Headers */ = {isa = PBXBuildFile; fileRef = 46CF504827AB1EA10070DBC8 /* SharedWorkerContextManager.h */; settings = {ATTRIBUTES = (Private, ); }; };
46D0004026A0FEB300D1BF1E /* SubmitEvent.h in Headers */ = {isa = PBXBuildFile; fileRef = 46D0003E26A0FE6F00D1BF1E /* SubmitEvent.h */; };
46D21F192993268400835FBA /* HTMLDocumentParserFastPath.h in Headers */ = {isa = PBXBuildFile; fileRef = 46D21F182993268000835FBA /* HTMLDocumentParserFastPath.h */; };
46D21F192993268400835FBA /* HTMLDocumentParserFastPath.h in Headers */ = {isa = PBXBuildFile; fileRef = 46D21F182993268000835FBA /* HTMLDocumentParserFastPath.h */; settings = {ATTRIBUTES = (Private, ); }; };
46DA4A0227AB039300E375A8 /* TransferredMessagePort.h in Headers */ = {isa = PBXBuildFile; fileRef = 46DA4A0027AB038D00E375A8 /* TransferredMessagePort.h */; settings = {ATTRIBUTES = (Private, ); }; };
46DBB6501AB8C96F00D9A813 /* PowerObserverMac.h in Headers */ = {isa = PBXBuildFile; fileRef = 46DBB64E1AB8C96F00D9A813 /* PowerObserverMac.h */; settings = {ATTRIBUTES = (Private, ); }; };
46DC53BF28EB3D65005376B0 /* ScreenOrientationManager.h in Headers */ = {isa = PBXBuildFile; fileRef = 46DC53BE28EB3D64005376B0 /* ScreenOrientationManager.h */; settings = {ATTRIBUTES = (Private, ); }; };
Expand Down
2 changes: 1 addition & 1 deletion Source/WebCore/dom/ContainerNode.h
Expand Up @@ -69,7 +69,7 @@ class ContainerNode : public Node {
void parserRemoveChild(Node&);
void parserInsertBefore(Node& newChild, Node& refChild);

void removeChildren();
WEBCORE_EXPORT void removeChildren();

void takeAllChildrenFrom(ContainerNode*);

Expand Down
2 changes: 1 addition & 1 deletion Source/WebCore/dom/DocumentFragment.h
Expand Up @@ -31,7 +31,7 @@ namespace WebCore {
class DocumentFragment : public ContainerNode {
WTF_MAKE_ISO_ALLOCATED(DocumentFragment);
public:
static Ref<DocumentFragment> create(Document&);
WEBCORE_EXPORT static Ref<DocumentFragment> create(Document&);

void parseHTML(const String&, Element& contextElement, OptionSet<ParserContentPolicy> = { ParserContentPolicy::AllowScriptingContent, ParserContentPolicy::AllowPluginContent });
bool parseXML(const String&, Element* contextElement, OptionSet<ParserContentPolicy> = { ParserContentPolicy::AllowScriptingContent, ParserContentPolicy::AllowPluginContent });
Expand Down
12 changes: 6 additions & 6 deletions Source/WebCore/html/parser/HTMLDocumentParserFastPath.cpp
Expand Up @@ -696,17 +696,17 @@ class HTMLFastPathParser {
out.append(0xa0);
else {
// This handles uncommon named references.
String inputString { reference.data(), static_cast<unsigned>(reference.size()) };
SegmentedString inputSegmented { inputString };
// We need +1 to include the trailing semicolon. Without it, the HTMLEntityParser would think
// the input is incomplete and would fail parsing.
String inputString { reference.data(), static_cast<unsigned>(reference.size() + 1) };
SegmentedString inputSegmented { WTFMove(inputString) };
bool notEnoughCharacters = false;
if (!consumeHTMLEntity(inputSegmented, out, notEnoughCharacters) || notEnoughCharacters)
return didFail(HTMLFastPathResult::FailedParsingCharacterReference);
// consumeHTMLEntity() may not have consumed all the input.
if (auto remainingLength = inputSegmented.length()) {
if (*(m_parsingBuffer.position() - 1) == ';')
m_parsingBuffer.setPosition(m_parsingBuffer.position() - remainingLength - 1);
else
m_parsingBuffer.setPosition(m_parsingBuffer.position() - remainingLength);
ASSERT(*(m_parsingBuffer.position() - 1) == ';');
m_parsingBuffer.setPosition(m_parsingBuffer.position() - remainingLength);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion Source/WebCore/html/parser/HTMLDocumentParserFastPath.h
Expand Up @@ -41,7 +41,7 @@ class Element;

enum class ParserContentPolicy : uint8_t;

bool tryFastParsingHTMLFragment(const String& source, Document&, DocumentFragment&, Element& contextElement, OptionSet<ParserContentPolicy>);
WEBCORE_EXPORT bool tryFastParsingHTMLFragment(const String& source, Document&, DocumentFragment&, Element& contextElement, OptionSet<ParserContentPolicy>);

} // namespace WebCore

2 changes: 1 addition & 1 deletion Source/WebCore/html/parser/HTMLEntityParser.cpp
Expand Up @@ -79,7 +79,7 @@ class HTMLEntityParser {
consumedCharacters.append(cc);
source.advancePastNonNewline();
}
notEnoughCharacters = source.isEmpty();
notEnoughCharacters = source.isEmpty() && cc != ';';
if (notEnoughCharacters) {
// We can't decide on an entity because there might be a longer entity
// that we could match if we had more data.
Expand Down
37 changes: 37 additions & 0 deletions Tools/TestWebKitAPI/Tests/WebCore/HTMLParserIdioms.cpp
Expand Up @@ -26,14 +26,18 @@
#include "config.h"

#include "Test.h"
#include <WebCore/DocumentFragment.h>
#include <WebCore/FragmentScriptingPermission.h>
#include <WebCore/HTMLBodyElement.h>
#include <WebCore/HTMLDivElement.h>
#include <WebCore/HTMLDocument.h>
#include <WebCore/HTMLDocumentParserFastPath.h>
#include <WebCore/HTMLHtmlElement.h>
#include <WebCore/HTMLInputElement.h>
#include <WebCore/HTMLParserIdioms.h>
#include <WebCore/ProcessWarming.h>
#include <WebCore/Settings.h>
#include <WebCore/Text.h>
#include <wtf/text/WTFString.h>

using namespace WebCore;
Expand Down Expand Up @@ -196,4 +200,37 @@ TEST(WebCoreHTMLParser, HTMLInputElementCheckedState)
EXPECT_TRUE(inputElement->checked());
}

TEST(WebCoreHTMLParser, FastPathComplexHTMLEntityParsing)
{
ProcessWarming::initializeNames();

auto settings = Settings::create(nullptr);
auto document = HTMLDocument::create(nullptr, settings.get(), aboutBlankURL());
auto documentElement = HTMLHtmlElement::create(document);
document->appendChild(documentElement);
auto body = HTMLBodyElement::create(document);
documentElement->appendChild(body);

auto div = HTMLDivElement::create(document);
document->body()->appendChild(div);

auto fragment = DocumentFragment::create(document);
bool result = tryFastParsingHTMLFragment("Price: 12&cent; only"_s, document, fragment, div, { ParserContentPolicy::AllowScriptingContent, ParserContentPolicy::AllowPluginContent });
EXPECT_TRUE(result);

auto textChild = dynamicDowncast<Text>(fragment->firstChild());
ASSERT_TRUE(textChild);

EXPECT_STREQ(textChild->data().utf8().data(), String::fromUTF8("Price: 12¢ only").utf8().data());

fragment->removeChildren();
result = tryFastParsingHTMLFragment("Genius Nicer Dicer Plus | 18&nbsp&hellip;"_s, document, fragment, div, { ParserContentPolicy::AllowScriptingContent, ParserContentPolicy::AllowPluginContent });
EXPECT_TRUE(result);

textChild = dynamicDowncast<Text>(fragment->firstChild());
ASSERT_TRUE(textChild);

EXPECT_STREQ(textChild->data().utf8().data(), String::fromUTF8("Genius Nicer Dicer Plus | 18 …").utf8().data());
}

} // namespace TestWebKitAPI

0 comments on commit 2e3b48b

Please sign in to comment.