From 023c54054092dc68c5df3b230ed3137cbd753b16 Mon Sep 17 00:00:00 2001 From: Kohei Ueno Date: Sat, 4 Nov 2023 18:52:16 -0800 Subject: [PATCH] Support all of HTML's character entities in WebVTT https://bugs.webkit.org/show_bug.cgi?id=176225 Reviewed by Darin Adler. The WebVTT cue text tokenizer algorithm has been updated to support all of HTML's character entities in https://github.com/w3c/webvtt/pull/253. This patch updates the tokenizer to align with the latest spec. Spec: https://w3c.github.io/webvtt/#webvtt-cue-text-tokenizer The old `EscapeState` state for handling escape characters has been removed in favor of two new states. The `WEBVTT_SWITCH_TO` operation has been added to enable a state transition without advancing the input position. * LayoutTests/imported/w3c/web-platform-tests/webvtt/parsing/cue-text-parsing/tests/entities-expected.txt: * Source/WebCore/html/track/WebVTTTokenizer.cpp: (WebCore::ProcessEntity): (WebCore::WebVTTTokenizer::nextToken): Canonical link: https://commits.webkit.org/270240@main --- .../tests/entities-expected.txt | 180 ++---------------- Source/WebCore/html/track/WebVTTTokenizer.cpp | 75 +++----- 2 files changed, 43 insertions(+), 212 deletions(-) diff --git a/LayoutTests/imported/w3c/web-platform-tests/webvtt/parsing/cue-text-parsing/tests/entities-expected.txt b/LayoutTests/imported/w3c/web-platform-tests/webvtt/parsing/cue-text-parsing/tests/entities-expected.txt index 67981bb5ce20..3bb2de809018 100644 --- a/LayoutTests/imported/w3c/web-platform-tests/webvtt/parsing/cue-text-parsing/tests/entities-expected.txt +++ b/LayoutTests/imported/w3c/web-platform-tests/webvtt/parsing/cue-text-parsing/tests/entities-expected.txt @@ -1,183 +1,27 @@ PASS WebVTT cue data parser test entities - 3686fc0cdc60dc536e75df054b0bd372273db2cc -FAIL WebVTT cue data parser test entities - f1869f6e2853635eec81cc3afa3e2b8148ccbdc0 assert_equals: expected "#document-fragment\n| \"&\"" but got "#document-fragment\n| 
\"&\"" +PASS WebVTT cue data parser test entities - f1869f6e2853635eec81cc3afa3e2b8148ccbdc0 PASS WebVTT cue data parser test entities - 92d76530d723b6b4e4ef8280c01cf1c80f9bebdb -FAIL WebVTT cue data parser test entities - 261cd4e9df4a12535b66a0c39e9635aab2bb19aa assert_equals: expected "#document-fragment\n| \"&\"" but got "#document-fragment\n| \"&\"" +PASS WebVTT cue data parser test entities - 261cd4e9df4a12535b66a0c39e9635aab2bb19aa PASS WebVTT cue data parser test entities - 1a2269cdb73bf97ec6a99b0edabfe646c471b67e PASS WebVTT cue data parser test entities - 44ceb90884cceeeccb4f7024e3598f7dc5ceebfa PASS WebVTT cue data parser test entities - 05def72af03fc2b1617da950d871b9fd0ba20e5a PASS WebVTT cue data parser test entities - da999a55445eca43aa41e039ec439c1a812db297 -FAIL WebVTT cue data parser test entities - 0fd9e3823b62c028c1d50e35b1f3ee3df02a62eb assert_equals: expected "#document-fragment\n| \"\"\"" but got "#document-fragment\n| \""\"" +PASS WebVTT cue data parser test entities - 0fd9e3823b62c028c1d50e35b1f3ee3df02a62eb PASS WebVTT cue data parser test entities - e7387003fbacb22b706796c98b781eb4ebf5ff85 -FAIL WebVTT cue data parser test entities - 216cd0e914b9f2ccd04eff6d02a0b1ce24441d95 assert_equals: expected "#document-fragment\n| \"©\"" but got "#document-fragment\n| \"©\"" +PASS WebVTT cue data parser test entities - 216cd0e914b9f2ccd04eff6d02a0b1ce24441d95 PASS WebVTT cue data parser test entities - 2cdf20980d17a5d077299215e6a7e97f3c6b07e2 PASS WebVTT cue data parser test entities - 83f4500c0bd8598480713997a041d8f70fd3f11e PASS WebVTT cue data parser test entities - 2c6b2ba38a08eca45370f28a5b7df2aa463fb3dc PASS WebVTT cue data parser test entities - f4bb977c0a06851bdd19260c035a766c5c8ea093 PASS WebVTT cue data parser test entities - b1fff1ac42688d16e00f6c758d84e5152e39702d -FAIL WebVTT cue data parser test entities - bd68f6beda2c2264e61dff7359c1ad48bc0a9934 assert_equals: expected "#document-fragment\n| \" \"" but got "#document-fragment\n| \" \"" 
-FAIL WebVTT cue data parser test entities - 5b77a0be23453dfe6eea59d43bb0708f89e1df82 assert_equals: expected "#document-fragment\n| \" \"" but got "#document-fragment\n| \" \"" +PASS WebVTT cue data parser test entities - bd68f6beda2c2264e61dff7359c1ad48bc0a9934 +PASS WebVTT cue data parser test entities - 5b77a0be23453dfe6eea59d43bb0708f89e1df82 PASS WebVTT cue data parser test entities - 87986551b0e6180cb279f2aa4cdddf77daa90c11 -FAIL WebVTT cue data parser test entities - e3ac2060b915f0f499b2863f999dcdb38a5db79b assert_equals: expected "#document-fragment\n| \"∲\"" but got "#document-fragment\n| \"∲\"" -FAIL WebVTT cue data parser test entities - 31c8a5ecfa5c54d8c0ec5b4ee8f0bbea0d6d40af assert_equals: expected "#document-fragment\n| \"⫅̸\"" but got "#document-fragment\n| \"⫅̸\"" -FAIL WebVTT cue data parser test entities - 9ed59950764468c4ef2948d71cf75c3f2b60c74d assert_equals: expected "#document-fragment\n| \"∉\"" but got "#document-fragment\n| \"∉\"" -FAIL WebVTT cue data parser test entities - 71a6efcfab81264fb95bb3234c59687c11c72baf assert_equals: expected "#document-fragment\n| \"¬\"" but got "#document-fragment\n| \"¬\"" -FAIL WebVTT cue data parser test entities - 86d7c20ca3c060f9e699c7da43927c4a07a5d569 assert_equals: expected "#document-fragment\n| \"¬\"" but got "#document-fragment\n| \"¬\"" -FAIL WebVTT cue data parser test entities - 314cd94292df37044e90ce27b5606bf8ec636b94 assert_equals: expected "#document-fragment\n| \"¬it;\"" but got "#document-fragment\n| \"¬it;\"" -f1869f6e2853635eec81cc3afa3e2b8148ccbdc0 - -Input - -& -Expected - -#document-fragment -| "&" -Actual - -#document-fragment -| "&" -261cd4e9df4a12535b66a0c39e9635aab2bb19aa - -Input - -& -Expected - -#document-fragment -| "&" -Actual - -#document-fragment -| "&" -0fd9e3823b62c028c1d50e35b1f3ee3df02a62eb - -Input - -" -Expected - -#document-fragment -| """ -Actual - -#document-fragment -| """ -216cd0e914b9f2ccd04eff6d02a0b1ce24441d95 - -Input - -© -Expected - -#document-fragment -| 
"©" -Actual - -#document-fragment -| "©" -bd68f6beda2c2264e61dff7359c1ad48bc0a9934 - -Input - - -Expected - -#document-fragment -| " " -Actual - -#document-fragment -| " " -5b77a0be23453dfe6eea59d43bb0708f89e1df82 - -Input - - -Expected - -#document-fragment -| " " -Actual - -#document-fragment -| " " -e3ac2060b915f0f499b2863f999dcdb38a5db79b - -Input - -∲ -Expected - -#document-fragment -| "∲" -Actual - -#document-fragment -| "∲" -31c8a5ecfa5c54d8c0ec5b4ee8f0bbea0d6d40af - -Input - -⫅̸ -Expected - -#document-fragment -| "⫅̸" -Actual - -#document-fragment -| "⫅̸" -9ed59950764468c4ef2948d71cf75c3f2b60c74d - -Input - -∉ -Expected - -#document-fragment -| "∉" -Actual - -#document-fragment -| "∉" -71a6efcfab81264fb95bb3234c59687c11c72baf - -Input - -¬ -Expected - -#document-fragment -| "¬" -Actual - -#document-fragment -| "¬" -86d7c20ca3c060f9e699c7da43927c4a07a5d569 - -Input - -¬ -Expected - -#document-fragment -| "¬" -Actual - -#document-fragment -| "¬" -314cd94292df37044e90ce27b5606bf8ec636b94 - -Input - -¬it; -Expected - -#document-fragment -| "¬it;" -Actual - -#document-fragment -| "¬it;" +PASS WebVTT cue data parser test entities - e3ac2060b915f0f499b2863f999dcdb38a5db79b +PASS WebVTT cue data parser test entities - 31c8a5ecfa5c54d8c0ec5b4ee8f0bbea0d6d40af +PASS WebVTT cue data parser test entities - 9ed59950764468c4ef2948d71cf75c3f2b60c74d +PASS WebVTT cue data parser test entities - 71a6efcfab81264fb95bb3234c59687c11c72baf +PASS WebVTT cue data parser test entities - 86d7c20ca3c060f9e699c7da43927c4a07a5d569 +PASS WebVTT cue data parser test entities - 314cd94292df37044e90ce27b5606bf8ec636b94 diff --git a/Source/WebCore/html/track/WebVTTTokenizer.cpp b/Source/WebCore/html/track/WebVTTTokenizer.cpp index aa6157cbb3cd..f19f522bbd53 100644 --- a/Source/WebCore/html/track/WebVTTTokenizer.cpp +++ b/Source/WebCore/html/track/WebVTTTokenizer.cpp @@ -34,6 +34,7 @@ #if ENABLE(VIDEO) +#include "HTMLEntityParser.h" #include "MarkupTokenizerInlines.h" #include #include @@ 
-47,6 +48,13 @@ namespace WebCore { character = m_preprocessor.nextInputCharacter(); \ goto stateName; \ } while (false) +#define WEBVTT_SWITCH_TO(stateName) \ + do { \ + ASSERT(!m_input.isEmpty()); \ + m_preprocessor.peek(m_input); \ + character = m_preprocessor.nextInputCharacter(); \ + goto stateName; \ + } while (false) template ALWAYS_INLINE bool equalLiteral(const StringBuilder& s, const char (&characters)[charactersCount]) { @@ -82,6 +90,17 @@ WebVTTTokenizer::WebVTTTokenizer(const String& input) m_input.close(); } +static void ProcessEntity(SegmentedString& source, StringBuilder& result, UChar additionalAllowedCharacter = 0) +{ + auto decoded = consumeHTMLEntity(source, additionalAllowedCharacter); + if (decoded.failed() || decoded.notEnoughCharacters()) + result.append('&'); + else { + for (auto character : decoded.span()) + result.append(character); + } +} + bool WebVTTTokenizer::nextToken(WebVTTToken& token) { if (m_input.isEmpty() || !m_preprocessor.peek(m_input)) @@ -100,8 +119,7 @@ bool WebVTTTokenizer::nextToken(WebVTTToken& token) // 4.8.10.13.4 WebVTT cue text tokenizer DataState: if (character == '&') { - buffer.append('&'); - WEBVTT_ADVANCE_TO(EscapeState); + WEBVTT_ADVANCE_TO(HTMLCharacterReferenceInDataState); } else if (character == '<') { if (result.isEmpty()) WEBVTT_ADVANCE_TO(TagState); @@ -117,47 +135,6 @@ bool WebVTTTokenizer::nextToken(WebVTTToken& token) WEBVTT_ADVANCE_TO(DataState); } -EscapeState: - if (character == ';') { - if (equalLiteral(buffer, "&")) - result.append('&'); - else if (equalLiteral(buffer, "<")) - result.append('<'); - else if (equalLiteral(buffer, ">")) - result.append('>'); - else if (equalLiteral(buffer, "&lrm")) - result.append(leftToRightMark); - else if (equalLiteral(buffer, "&rlm")) - result.append(rightToLeftMark); - else if (equalLiteral(buffer, " ")) - result.append(noBreakSpace); - else { - buffer.append(character); - result.append(buffer); - } - buffer.clear(); - WEBVTT_ADVANCE_TO(DataState); - } else if 
(isASCIIAlphanumeric(character)) { - buffer.append(character); - WEBVTT_ADVANCE_TO(EscapeState); - } else if (character == '<') { - result.append(buffer); - return emitToken(token, WebVTTToken::StringToken(result.toString())); - } else if (character == kEndOfFileMarker) { - result.append(buffer); - return advanceAndEmitToken(m_input, token, WebVTTToken::StringToken(result.toString())); - } else { - result.append(buffer); - buffer.clear(); - - if (character == '&') { - buffer.append('&'); - WEBVTT_ADVANCE_TO(EscapeState); - } - result.append(character); - WEBVTT_ADVANCE_TO(DataState); - } - TagState: if (isTokenizerWhitespace(character)) { ASSERT(result.isEmpty()); @@ -209,7 +186,9 @@ bool WebVTTTokenizer::nextToken(WebVTTToken& token) } StartTagAnnotationState: - if (character == '>' || character == kEndOfFileMarker) + if (character == '&') + WEBVTT_ADVANCE_TO(HTMLCharacterReferenceInAnnotationState); + else if (character == '>' || character == kEndOfFileMarker) return advanceAndEmitToken(m_input, token, WebVTTToken::StartTag(result.toString(), classes.toAtomString(), buffer.toAtomString())); buffer.append(character); WEBVTT_ADVANCE_TO(StartTagAnnotationState); @@ -225,6 +204,14 @@ bool WebVTTTokenizer::nextToken(WebVTTToken& token) return advanceAndEmitToken(m_input, token, WebVTTToken::TimestampTag(result.toString())); result.append(character); WEBVTT_ADVANCE_TO(TimestampTagState); + +HTMLCharacterReferenceInDataState: + ProcessEntity(m_input, result); + WEBVTT_SWITCH_TO(DataState); + +HTMLCharacterReferenceInAnnotationState: + ProcessEntity(m_input, result, '>'); + WEBVTT_SWITCH_TO(StartTagAnnotationState); } }