diff --git a/bin/_docs_components/html.md b/bin/_docs_components/html.md index 35d566120..b2aa2c50f 100644 --- a/bin/_docs_components/html.md +++ b/bin/_docs_components/html.md @@ -128,7 +128,9 @@ HTML; $tags = new WP_HTML_Tag_Processor( $untrusted ); while ( $tags->next_tag() ) { - if ( 'SCRIPT' === $tags->get_tag() && ! $tags->is_tag_closer() ) { + // next_tag() never lands on closing tags, so no is_tag_closer() guard + // is needed here. + if ( 'SCRIPT' === $tags->get_tag() ) { $tags->set_modifiable_text( '' ); } foreach ( $tags->get_attribute_names_with_prefix( 'on' ) as $attr ) { @@ -168,7 +170,7 @@ HTML; $tags = new WP_HTML_Tag_Processor( $html ); while ( $tags->next_tag() ) { $tag = $tags->get_tag(); - if ( ( 'SCRIPT' === $tag || 'STYLE' === $tag ) && ! $tags->is_tag_closer() ) { + if ( 'SCRIPT' === $tag || 'STYLE' === $tag ) { $tags->set_attribute( 'nonce', $nonce ); } } @@ -237,9 +239,11 @@ require '/wordpress/wp-content/php-toolkit/vendor/autoload.php'; echo "attribute: " . WP_HTML_Decoder::decode_attribute( 'path?a=1&b=2©' ) . "\n"; echo "text: " . WP_HTML_Decoder::decode_text_node( 'AT&T — 100% 😀' ) . "\n"; -// Safe URL prefix check that respects encoded colons (a classic XSS vector). +// Safe URL prefix check that decodes character references while comparing. +// `&#x6A;` is the letter `j`, so this string really does start with javascript:. +// strpos() would miss it. $is_javascript = WP_HTML_Decoder::attribute_starts_with( - 'java script:alert(1)', + '&#x6A;avascript:alert(1)', 'javascript:', 'ascii-case-insensitive' ); @@ -250,7 +254,7 @@ var_dump( $is_javascript ); ``` attribute: path?a=1&b=2© text: AT&T — 100% 😀 -bool(false) +bool(true) ``` ## Find images by ancestry with breadcrumbs @@ -400,11 +404,11 @@ echo $tags->get_updated_html();
WP_HTML_Tag_ProcessorWP_HTML_Processor::create_fragment()breadcrumbs), heading outline extraction, anything that needs to know "is this tag inside that one."WP_HTML_Decoder::decode_text_node()AT&T) back into raw text correctly. Implements the HTML5 entity algorithm — don't roll your own.WP_HTML_Decoder::attribute_starts_with()java	script:). The classic strpos approach misses these.WP_HTML_Decoder::attribute_starts_with()j&#x61;vascript: (where &#x61; is the letter a) is correctly recognized as starting with javascript:. The classic strpos approach misses these.Footgun: Tag closers are visited too. next_tag() stops on both opening and closing tags. For most attribute-rewriting code, gate with ! $tags->is_tag_closer() so you don't try to set attributes on a </script>.
Footgun: next_tag() only stops on opening tags by default. Closers and text are skipped, so inside a plain next_tag() loop is_tag_closer() always returns false and a guard like ! $tags->is_tag_closer() is harmless but redundant. If you need to visit closing tags, pass array( 'tag_closers' => 'visit' ) to next_tag(); if you need text nodes as well, use next_token() instead and check get_token_type().
Footgun: Tag-name matches are uppercase. get_tag() always returns the tag name in uppercase ('IMG', not 'img'), so compare against uppercase strings. The tag-name filter passed to next_tag() is matched case-insensitively: next_tag( 'img' ) and next_tag( 'IMG' ) find the same tags.
Footgun: Don't confuse WP_HTML_Tag_Processor with the full processor. The cursor is forward-only and ancestry-blind. If you call get_breadcrumbs() on it, you'll get a thin shape that doesn't reflect HTML5 tree construction — implicit <tbody> insertion, automatic <p> closing, and the rest live only in WP_HTML_Processor.
Footgun: Don't confuse WP_HTML_Tag_Processor with the full processor. The cursor is forward-only and ancestry-blind, and it doesn't expose get_breadcrumbs() at all — calling that on a WP_HTML_Tag_Processor raises a Call to undefined method error. Breadcrumbs and HTML5 tree construction (implicit <tbody> insertion, automatic <p> closing, and the rest) live only on WP_HTML_Processor.
Feed Markdown into MarkdownConsumer, get block markup back. The result is a BlocksWithMetadata object that holds both the rendered blocks and any frontmatter parsed from the document.
Feed Markdown into MarkdownConsumer, get block markup back. The result is a BlocksWithMetadata object (defined in the WordPress\DataLiberation\DataFormatConsumer namespace — it is the shared result shape that every DataFormatConsumer in the toolkit emits) that holds both the rendered blocks and any frontmatter parsed from the document.