Skip to content

Commit

Permalink
Parsing HTML entities shouldn't call malloc
Browse files Browse the repository at this point in the history
https://bugs.webkit.org/show_bug.cgi?id=119921
rdar://109976279

Reviewed by Chris Dumez.

This was inspired by some work done on Chromium. While I didn't use
the Chromium patch, I did much the same work, taking advantage of the
fact that HTML entity parsing only generates a sequence of 1-3 UTF-16
code points, not arbitrary strings. Also fixed mismatch between the
interface and the needs in the fast path HTML parser.

* Source/WebCore/WebCore.xcodeproj/project.pbxproj: Removed
CharacterReferenceParserInlines.h.

* Source/WebCore/html/parser/HTMLDocumentParserFastPath.cpp:
(WebCore::HTMLFastPathParser::scanHTMLCharacterReference): Use consumeHTMLEntity
function that takes a StringParsingBuffer. This elimintes the need for a
temporary SegmentedString, and resolves the FIXME that was here.

* Source/WebCore/html/parser/HTMLEntityParser.cpp:
(WebCore::DecodedHTMLEntity::DecodedHTMLEntity): Added constructors for the
class used for the return type.
(WebCore::makeEntity): Added. Converts a UChar32, Checked<UChar32>, or
HTMLEntityTableEntry into a DecodedHTMLEntity.
(WebCore::SegmentedStringSource): Added. Adapter for SegmentedString so we can
share a single set of parser functions.
(WebCore::StringParsingBufferSource): Added. Adapter for StringParsingBuffer.
(WebCore::consumeDecimalHTMLEntity): Added. Refactored from code formerly
in CharacterReferenceParserInlines.h.
(WebCore::consumeHexHTMLEntity): Ditto.
(WebCore::consumeNamedHTMLEntity): Added. Refactored from code formerly
in HTMLEntityParser::consumeNamedEntity.
(WebCore::consumeHTMLEntity): Added. Refactored from code formerly
in CharacterReferenceParserInlines.h.
(WebCore::decodeNamedHTMLEntityForXMLParser): Renamed from
decodeNamedEntityToUCharArray. We now take a std::array& for safety so it's
no longer necessary to put the data type in the function name.

* Source/WebCore/html/parser/HTMLEntityParser.h: Updated includes.
Added a new DecodedHTMLEntity type for the return value from the parser.
Got rid of out parameters and put the error cases in the return value.
Another alternative would have been std::expected.

* Source/WebCore/html/parser/HTMLTokenizer.cpp:
(WebCore::HTMLTokenizer::processEntity): Updated for changes to consumeHTMLEntity.
(WebCore::HTMLTokenizer::processToken): Ditto.

* Source/WebCore/xml/parser/CharacterReferenceParserInlines.h: Removed.

* Source/WebCore/xml/parser/XMLDocumentParserLibxml2.cpp:
(WebCore::convertUTF16EntityToUTF8): Updated to use std::span.
(WebCore::getXHTMLEntity): Updated for decodeNamedHTMLEntityForXMLParser.

Canonical link: https://commits.webkit.org/264675@main
  • Loading branch information
darinadler committed May 30, 2023
1 parent 03d330e commit 6298382
Show file tree
Hide file tree
Showing 7 changed files with 303 additions and 298 deletions.
8 changes: 2 additions & 6 deletions Source/WebCore/WebCore.xcodeproj/project.pbxproj
Expand Up @@ -3534,7 +3534,6 @@
97AABD2514FA09D5007457AE /* WebSocketFrame.h in Headers */ = {isa = PBXBuildFile; fileRef = 97AABD0A14FA09D5007457AE /* WebSocketFrame.h */; settings = {ATTRIBUTES = (Private, ); }; };
97AABD2714FA09D5007457AE /* WebSocketHandshake.h in Headers */ = {isa = PBXBuildFile; fileRef = 97AABD0C14FA09D5007457AE /* WebSocketHandshake.h */; settings = {ATTRIBUTES = (Private, ); }; };
97AABD2D14FA09D5007457AE /* WorkerThreadableWebSocketChannel.h in Headers */ = {isa = PBXBuildFile; fileRef = 97AABD1214FA09D5007457AE /* WorkerThreadableWebSocketChannel.h */; settings = {ATTRIBUTES = (Private, ); }; };
97B8FFD116AE7F960038388D /* CharacterReferenceParserInlines.h in Headers */ = {isa = PBXBuildFile; fileRef = 97B8FFCF16AE7F920038388D /* CharacterReferenceParserInlines.h */; };
97BC69DD1505F076001B74AC /* ChangeVersionWrapper.h in Headers */ = {isa = PBXBuildFile; fileRef = 97BC69D91505F076001B74AC /* ChangeVersionWrapper.h */; };
97BC6A211505F081001B74AC /* Database.h in Headers */ = {isa = PBXBuildFile; fileRef = 97BC69DF1505F081001B74AC /* Database.h */; };
97BC6A241505F081001B74AC /* DatabaseAuthorizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 97BC69E21505F081001B74AC /* DatabaseAuthorizer.h */; };
Expand Down Expand Up @@ -14452,7 +14451,6 @@
97AABD0C14FA09D5007457AE /* WebSocketHandshake.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WebSocketHandshake.h; sourceTree = "<group>"; };
97AABD1114FA09D5007457AE /* WorkerThreadableWebSocketChannel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WorkerThreadableWebSocketChannel.cpp; sourceTree = "<group>"; };
97AABD1214FA09D5007457AE /* WorkerThreadableWebSocketChannel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WorkerThreadableWebSocketChannel.h; sourceTree = "<group>"; };
97B8FFCF16AE7F920038388D /* CharacterReferenceParserInlines.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CharacterReferenceParserInlines.h; sourceTree = "<group>"; };
97BC69D81505F076001B74AC /* ChangeVersionWrapper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ChangeVersionWrapper.cpp; sourceTree = "<group>"; };
97BC69D91505F076001B74AC /* ChangeVersionWrapper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ChangeVersionWrapper.h; sourceTree = "<group>"; };
97BC69DE1505F081001B74AC /* Database.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Database.cpp; sourceTree = "<group>"; };
Expand Down Expand Up @@ -19697,7 +19695,6 @@
00B9318013BA867F0035A948 /* parser */ = {
isa = PBXGroup;
children = (
97B8FFCF16AE7F920038388D /* CharacterReferenceParserInlines.h */,
00C60E3E13D76D7E0092A275 /* MarkupTokenizerInlines.h */,
00B9318113BA867F0035A948 /* XMLDocumentParser.cpp */,
00B9318213BA867F0035A948 /* XMLDocumentParser.h */,
Expand Down Expand Up @@ -31735,8 +31732,8 @@
BC5EB6670E81CB7100B25965 /* RenderStyleConstants.h */,
930A9D8429DA1FC0006CBF0E /* RenderStyleInlines.h */,
930A9D8B29DA8207006CBF0E /* RenderStyleSetters.h */,
A39F8E222A1ECC1E00967C09 /* ScrollbarGutter.cpp */,
A39F8E212A1ECBFC00967C09 /* ScrollbarGutter.h */,
A39F8E222A1ECC1E00967C09 /* ScrollbarGutter.cpp */,
A39F8E212A1ECBFC00967C09 /* ScrollbarGutter.h */,
BC5EB8C10E82031B00B25965 /* ShadowData.cpp */,
BC5EB8C20E82031B00B25965 /* ShadowData.h */,
1AB5EBCF194A1D170059AC70 /* ShapeValue.cpp */,
Expand Down Expand Up @@ -36365,7 +36362,6 @@
6550B6A0099DF0270090D781 /* CharacterData.h in Headers */,
C5592F781A92AA28001F8862 /* CharacterProperties.h in Headers */,
93F2CC932427FB9C005851D8 /* CharacterRange.h in Headers */,
97B8FFD116AE7F960038388D /* CharacterReferenceParserInlines.h in Headers */,
F55B3DB21251F12D003EF269 /* CheckboxInputType.h in Headers */,
E4C4C61A27452A7900A040E7 /* ChildChangeInvalidation.h in Headers */,
D619A308144E00BE004BC302 /* ChildListMutationScope.h in Headers */,
Expand Down
11 changes: 2 additions & 9 deletions Source/WebCore/html/parser/HTMLDocumentParserFastPath.cpp
Expand Up @@ -648,15 +648,8 @@ class HTMLFastPathParser {
m_parsingBuffer.advance();

if (LIKELY(m_parsingBuffer.lengthRemaining() >= 2)) {
// FIXME: It is unnecessarily inefficient to use a SegmentedString here. Ideally, we'd
// be able to parse HTMLEntities straight from a StringView.
StringView inputString(m_parsingBuffer.position(), m_parsingBuffer.lengthRemaining());
SegmentedString inputSegmented { inputString };
inputSegmented.append(String { &kEndOfFileMarker, 1 }); // Matches what the full parser does in markEndOfFile().
bool notEnoughCharacters = false;
if (consumeHTMLEntity(inputSegmented, out, notEnoughCharacters)) {
// Advance m_parsingBuffer by the amount of characters consumed by consumeHTMLEntity().
m_parsingBuffer.advanceBy(inputString.length() - (inputSegmented.length() - 1));
if (auto entity = consumeHTMLEntity(m_parsingBuffer); !entity.failed()) {
out.append(entity.span());
return;
}
}
Expand Down

0 comments on commit 6298382

Please sign in to comment.