Skip to content

Commit

Permalink
Support all of HTML's character entities in WebVTT
Browse files Browse the repository at this point in the history
https://bugs.webkit.org/show_bug.cgi?id=176225

Reviewed by Darin Adler.

The WebVTT cue text tokenizer algorithm was updated in w3c/webvtt#253 to
support all of HTML's character entities.
This patch updates the tokenizer to align with the latest spec.
Spec: https://w3c.github.io/webvtt/#webvtt-cue-text-tokenizer

The old `EscapeState` state for handling escape characters has been
removed in favor of two new states, and the `WEBVTT_SWITCH_TO` operation
has been added to enable a state transition without advancing the
input position.

* LayoutTests/imported/w3c/web-platform-tests/webvtt/parsing/cue-text-parsing/tests/entities-expected.txt:
* Source/WebCore/html/track/WebVTTTokenizer.cpp:
(WebCore::ProcessEntity):
(WebCore::WebVTTTokenizer::nextToken):

Canonical link: https://commits.webkit.org/270240@main
  • Loading branch information
cola119 authored and Ahmad Saleem committed Nov 5, 2023
1 parent 6e5d98e commit 023c540
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 212 deletions.
@@ -1,183 +1,27 @@

PASS WebVTT cue data parser test entities - 3686fc0cdc60dc536e75df054b0bd372273db2cc
FAIL WebVTT cue data parser test entities - f1869f6e2853635eec81cc3afa3e2b8148ccbdc0 assert_equals: expected "#document-fragment\n| \"&\"" but got "#document-fragment\n| \"&amp\""
PASS WebVTT cue data parser test entities - f1869f6e2853635eec81cc3afa3e2b8148ccbdc0
PASS WebVTT cue data parser test entities - 92d76530d723b6b4e4ef8280c01cf1c80f9bebdb
FAIL WebVTT cue data parser test entities - 261cd4e9df4a12535b66a0c39e9635aab2bb19aa assert_equals: expected "#document-fragment\n| \"&\"" but got "#document-fragment\n| \"&\""
PASS WebVTT cue data parser test entities - 261cd4e9df4a12535b66a0c39e9635aab2bb19aa
PASS WebVTT cue data parser test entities - 1a2269cdb73bf97ec6a99b0edabfe646c471b67e
PASS WebVTT cue data parser test entities - 44ceb90884cceeeccb4f7024e3598f7dc5ceebfa
PASS WebVTT cue data parser test entities - 05def72af03fc2b1617da950d871b9fd0ba20e5a
PASS WebVTT cue data parser test entities - da999a55445eca43aa41e039ec439c1a812db297
FAIL WebVTT cue data parser test entities - 0fd9e3823b62c028c1d50e35b1f3ee3df02a62eb assert_equals: expected "#document-fragment\n| \"\"\"" but got "#document-fragment\n| \""\""
PASS WebVTT cue data parser test entities - 0fd9e3823b62c028c1d50e35b1f3ee3df02a62eb
PASS WebVTT cue data parser test entities - e7387003fbacb22b706796c98b781eb4ebf5ff85
FAIL WebVTT cue data parser test entities - 216cd0e914b9f2ccd04eff6d02a0b1ce24441d95 assert_equals: expected "#document-fragment\n| \"©\"" but got "#document-fragment\n| \"©\""
PASS WebVTT cue data parser test entities - 216cd0e914b9f2ccd04eff6d02a0b1ce24441d95
PASS WebVTT cue data parser test entities - 2cdf20980d17a5d077299215e6a7e97f3c6b07e2
PASS WebVTT cue data parser test entities - 83f4500c0bd8598480713997a041d8f70fd3f11e
PASS WebVTT cue data parser test entities - 2c6b2ba38a08eca45370f28a5b7df2aa463fb3dc
PASS WebVTT cue data parser test entities - f4bb977c0a06851bdd19260c035a766c5c8ea093
PASS WebVTT cue data parser test entities - b1fff1ac42688d16e00f6c758d84e5152e39702d
FAIL WebVTT cue data parser test entities - bd68f6beda2c2264e61dff7359c1ad48bc0a9934 assert_equals: expected "#document-fragment\n| \" \"" but got "#document-fragment\n| \" \""
FAIL WebVTT cue data parser test entities - 5b77a0be23453dfe6eea59d43bb0708f89e1df82 assert_equals: expected "#document-fragment\n| \" \"" but got "#document-fragment\n| \" \""
PASS WebVTT cue data parser test entities - bd68f6beda2c2264e61dff7359c1ad48bc0a9934
PASS WebVTT cue data parser test entities - 5b77a0be23453dfe6eea59d43bb0708f89e1df82
PASS WebVTT cue data parser test entities - 87986551b0e6180cb279f2aa4cdddf77daa90c11
FAIL WebVTT cue data parser test entities - e3ac2060b915f0f499b2863f999dcdb38a5db79b assert_equals: expected "#document-fragment\n| \"∲\"" but got "#document-fragment\n| \"∲\""
FAIL WebVTT cue data parser test entities - 31c8a5ecfa5c54d8c0ec5b4ee8f0bbea0d6d40af assert_equals: expected "#document-fragment\n| \"⫅̸\"" but got "#document-fragment\n| \"⫅̸\""
FAIL WebVTT cue data parser test entities - 9ed59950764468c4ef2948d71cf75c3f2b60c74d assert_equals: expected "#document-fragment\n| \"∉\"" but got "#document-fragment\n| \"∉\""
FAIL WebVTT cue data parser test entities - 71a6efcfab81264fb95bb3234c59687c11c72baf assert_equals: expected "#document-fragment\n| \"¬\"" but got "#document-fragment\n| \"¬\""
FAIL WebVTT cue data parser test entities - 86d7c20ca3c060f9e699c7da43927c4a07a5d569 assert_equals: expected "#document-fragment\n| \"¬\"" but got "#document-fragment\n| \"&not\""
FAIL WebVTT cue data parser test entities - 314cd94292df37044e90ce27b5606bf8ec636b94 assert_equals: expected "#document-fragment\n| \"¬it;\"" but got "#document-fragment\n| \"&notit;\""
f1869f6e2853635eec81cc3afa3e2b8148ccbdc0

Input

&amp
Expected

#document-fragment
| "&"
Actual

#document-fragment
| "&amp"
261cd4e9df4a12535b66a0c39e9635aab2bb19aa

Input

&
Expected

#document-fragment
| "&"
Actual

#document-fragment
| "&"
0fd9e3823b62c028c1d50e35b1f3ee3df02a62eb

Input

"
Expected

#document-fragment
| """
Actual

#document-fragment
| """
216cd0e914b9f2ccd04eff6d02a0b1ce24441d95

Input

©
Expected

#document-fragment
| "©"
Actual

#document-fragment
| "©"
bd68f6beda2c2264e61dff7359c1ad48bc0a9934

Input

 
Expected

#document-fragment
| " "
Actual

#document-fragment
| " "
5b77a0be23453dfe6eea59d43bb0708f89e1df82

Input

 
Expected

#document-fragment
| " "
Actual

#document-fragment
| " "
e3ac2060b915f0f499b2863f999dcdb38a5db79b

Input

∲
Expected

#document-fragment
| "∲"
Actual

#document-fragment
| "∲"
31c8a5ecfa5c54d8c0ec5b4ee8f0bbea0d6d40af

Input

⫅̸
Expected

#document-fragment
| "⫅̸"
Actual

#document-fragment
| "⫅̸"
9ed59950764468c4ef2948d71cf75c3f2b60c74d

Input

∉
Expected

#document-fragment
| "∉"
Actual

#document-fragment
| "∉"
71a6efcfab81264fb95bb3234c59687c11c72baf

Input

¬
Expected

#document-fragment
| "¬"
Actual

#document-fragment
| "¬"
86d7c20ca3c060f9e699c7da43927c4a07a5d569

Input

&not
Expected

#document-fragment
| "¬"
Actual

#document-fragment
| "&not"
314cd94292df37044e90ce27b5606bf8ec636b94

Input

&notit;
Expected

#document-fragment
| "¬it;"
Actual

#document-fragment
| "&notit;"
PASS WebVTT cue data parser test entities - e3ac2060b915f0f499b2863f999dcdb38a5db79b
PASS WebVTT cue data parser test entities - 31c8a5ecfa5c54d8c0ec5b4ee8f0bbea0d6d40af
PASS WebVTT cue data parser test entities - 9ed59950764468c4ef2948d71cf75c3f2b60c74d
PASS WebVTT cue data parser test entities - 71a6efcfab81264fb95bb3234c59687c11c72baf
PASS WebVTT cue data parser test entities - 86d7c20ca3c060f9e699c7da43927c4a07a5d569
PASS WebVTT cue data parser test entities - 314cd94292df37044e90ce27b5606bf8ec636b94

75 changes: 31 additions & 44 deletions Source/WebCore/html/track/WebVTTTokenizer.cpp
Expand Up @@ -34,6 +34,7 @@

#if ENABLE(VIDEO)

#include "HTMLEntityParser.h"
#include "MarkupTokenizerInlines.h"
#include <wtf/text/StringBuilder.h>
#include <wtf/unicode/CharacterNames.h>
Expand All @@ -47,6 +48,13 @@ namespace WebCore {
character = m_preprocessor.nextInputCharacter(); \
goto stateName; \
} while (false)
// Jumps to the tokenizer state labelled `stateName` without explicitly advancing m_input.
// It asserts that m_input is not empty, re-peeks m_input through m_preprocessor, reloads
// `character` from the preprocessor, and then performs the goto. Intended for states that
// have already consumed input themselves (e.g. via ProcessEntity) before switching.
// NOTE(review): the return value of m_preprocessor.peek() is ignored here — confirm that
// peek cannot fail at the call sites that use this macro.
#define WEBVTT_SWITCH_TO(stateName) \
do { \
ASSERT(!m_input.isEmpty()); \
m_preprocessor.peek(m_input); \
character = m_preprocessor.nextInputCharacter(); \
goto stateName; \
} while (false)

template<unsigned charactersCount> ALWAYS_INLINE bool equalLiteral(const StringBuilder& s, const char (&characters)[charactersCount])
{
Expand Down Expand Up @@ -82,6 +90,17 @@ WebVTTTokenizer::WebVTTTokenizer(const String& input)
m_input.close();
}

// Attempts to consume one HTML character reference from `source` via consumeHTMLEntity()
// and appends the outcome to `result`:
//   - if decoding failed or ran out of characters, a literal '&' is appended;
//   - otherwise every code unit of the decoded entity is appended in order.
// `additionalAllowedCharacter` is forwarded unchanged to consumeHTMLEntity().
static void ProcessEntity(SegmentedString& source, StringBuilder& result, UChar additionalAllowedCharacter = 0)
{
    auto entity = consumeHTMLEntity(source, additionalAllowedCharacter);

    const bool decodedSuccessfully = !entity.failed() && !entity.notEnoughCharacters();
    if (!decodedSuccessfully) {
        // Not a usable character reference: keep the ampersand as plain text.
        result.append('&');
        return;
    }

    for (auto codeUnit : entity.span())
        result.append(codeUnit);
}

bool WebVTTTokenizer::nextToken(WebVTTToken& token)
{
if (m_input.isEmpty() || !m_preprocessor.peek(m_input))
Expand All @@ -100,8 +119,7 @@ bool WebVTTTokenizer::nextToken(WebVTTToken& token)
// 4.8.10.13.4 WebVTT cue text tokenizer
DataState:
if (character == '&') {
buffer.append('&');
WEBVTT_ADVANCE_TO(EscapeState);
WEBVTT_ADVANCE_TO(HTMLCharacterReferenceInDataState);
} else if (character == '<') {
if (result.isEmpty())
WEBVTT_ADVANCE_TO(TagState);
Expand All @@ -117,47 +135,6 @@ bool WebVTTTokenizer::nextToken(WebVTTToken& token)
WEBVTT_ADVANCE_TO(DataState);
}

EscapeState:
if (character == ';') {
if (equalLiteral(buffer, "&amp"))
result.append('&');
else if (equalLiteral(buffer, "&lt"))
result.append('<');
else if (equalLiteral(buffer, "&gt"))
result.append('>');
else if (equalLiteral(buffer, "&lrm"))
result.append(leftToRightMark);
else if (equalLiteral(buffer, "&rlm"))
result.append(rightToLeftMark);
else if (equalLiteral(buffer, "&nbsp"))
result.append(noBreakSpace);
else {
buffer.append(character);
result.append(buffer);
}
buffer.clear();
WEBVTT_ADVANCE_TO(DataState);
} else if (isASCIIAlphanumeric(character)) {
buffer.append(character);
WEBVTT_ADVANCE_TO(EscapeState);
} else if (character == '<') {
result.append(buffer);
return emitToken(token, WebVTTToken::StringToken(result.toString()));
} else if (character == kEndOfFileMarker) {
result.append(buffer);
return advanceAndEmitToken(m_input, token, WebVTTToken::StringToken(result.toString()));
} else {
result.append(buffer);
buffer.clear();

if (character == '&') {
buffer.append('&');
WEBVTT_ADVANCE_TO(EscapeState);
}
result.append(character);
WEBVTT_ADVANCE_TO(DataState);
}

TagState:
if (isTokenizerWhitespace(character)) {
ASSERT(result.isEmpty());
Expand Down Expand Up @@ -209,7 +186,9 @@ bool WebVTTTokenizer::nextToken(WebVTTToken& token)
}

StartTagAnnotationState:
if (character == '>' || character == kEndOfFileMarker)
if (character == '&')
WEBVTT_ADVANCE_TO(HTMLCharacterReferenceInAnnotationState);
else if (character == '>' || character == kEndOfFileMarker)
return advanceAndEmitToken(m_input, token, WebVTTToken::StartTag(result.toString(), classes.toAtomString(), buffer.toAtomString()));
buffer.append(character);
WEBVTT_ADVANCE_TO(StartTagAnnotationState);
Expand All @@ -225,6 +204,14 @@ bool WebVTTTokenizer::nextToken(WebVTTToken& token)
return advanceAndEmitToken(m_input, token, WebVTTToken::TimestampTag(result.toString()));
result.append(character);
WEBVTT_ADVANCE_TO(TimestampTagState);

HTMLCharacterReferenceInDataState:
// Reached from DataState after a '&'. ProcessEntity reads directly from m_input and appends
// either the decoded characters or a literal '&' to `result`; the tokenizer then switches
// back to DataState without an extra advance (WEBVTT_SWITCH_TO re-peeks the input).
ProcessEntity(m_input, result);
WEBVTT_SWITCH_TO(DataState);

HTMLCharacterReferenceInAnnotationState:
// Reached from StartTagAnnotationState after a '&'. In that state the annotation text is
// accumulated in `buffer` (characters are appended to it and it is emitted through
// buffer.toAtomString()), while `result` is emitted as the first StartTag argument
// (presumably the tag name). The decoded reference — or the literal '&' on failure — must
// therefore be appended to `buffer`, not `result`, so that it ends up in the annotation.
// '>' is forwarded as the additional allowed character for entity consumption.
ProcessEntity(m_input, buffer, '>');
WEBVTT_SWITCH_TO(StartTagAnnotationState);
}

}
Expand Down

0 comments on commit 023c540

Please sign in to comment.