From d10f3b20f436868ac4fe5667c39d4ac5a8920d24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sat, 1 Nov 2025 22:57:52 +0100 Subject: [PATCH 1/4] [XMLProcessor] ASCII fast path for parsing names --- components/XML/class-xmlprocessor.php | 103 +++++++++++++++++++------- 1 file changed, 78 insertions(+), 25 deletions(-) diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index 5c15f8eb..40f6ef5b 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -5,7 +5,8 @@ use WP_HTML_Span; use WP_HTML_Text_Replacement; -use function WordPress\Encoding\utf8_codepoint_at; +use function WordPress\Encoding\compat\_wp_scan_utf8; +use function WordPress\Encoding\utf8_ord; /** * XML API: XMLProcessor class @@ -43,9 +44,9 @@ * * @TODO: Support XML 1.1. * - * @TODO: Evaluate the performance of utf8_codepoint_at() against using the mbstring + * @TODO: Evaluate the performance of _wp_scan_utf8() against using the mbstring * extension. If mbstring is faster, then use it whenever it's available with - * utf8_codepoint_at() as a fallback. + * _wp_scan_utf8() as a fallback. * * @package WordPress * @subpackage HTML-API @@ -2328,17 +2329,25 @@ private function skip_whitespace() { * @return int */ private function parse_name( $offset ) { - static $i = 0; $name_byte_length = 0; + $at = $offset; + + // Fast path: consume any ASCII NameStartChar bytes. + $name_byte_length += strspn( + $this->xml, + ':ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz', + $offset + $name_byte_length, + 1 + ); + while ( true ) { /** * Parse the next unicode codepoint. * - * We use a custom UTF-8 decoder here. No other method - * is reliable and available enough to depend on it in - * WordPress core: + * We use a the `_wp_scan_utf8` UTF-8 decoder introduced in WordPress 6.9. No other method + * is reliable and available enough to depend on it in WordPress core: * - * * mb_ord() – is not available on all hosts. + * * mb_ord() – is only available on 95% of hosts. * * iconv_substr() – is not available on all hosts. * * preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input * contains an incomplete UTF-8 byte sequence – even @@ -2348,28 +2357,72 @@ private function parse_name( $offset ) { * how to reliably reproduce this failure mode in a * unit test. * - * Performance-wise, character-by-character processing via utf8_codepoint_at - * is still much faster than relying on preg_match(). The mbstring extension - * is likely faster. It would be interesting to evaluate the performance - * and prefer mbstring whenever it's available. + * Performance-wise, character-by-character processing via _wp_scan_utf8 + * is pretty slow. The ASCII fast path below enables skipping most of the + * UTF-8 decoder calls. + * + * If the UTF-8 decoder performance ever becomes a bottleneck, there is a + * few ways to significantly improve it: + * + * * Call a native grapheme_ function when available. + * * Introduce a custom UTF-8 decoder optimized for codepoint-by-codepoint processing. + * It could be the streaming version of the UTF-8 decoder, such as `_wp_iterate_utf8`, + * that avoids the repeated strspn() calls. Alternatively, the older `utf8_codepoint_at` + * function could be restored if its codepoint-by-codepoint decoding performance is + * better than the _wp_scan_utf8. + */ + + /** + * The ASCII speedup includes all ASCII NameStartChar, which are also valid + * NameChar, making it possible to quickly scan past these bytes without + * further processing. + */ + $name_byte_length += strspn( $this->xml, ":ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz-.0123456789\u{B7}", $offset + $name_byte_length ); + + /* + * Quickly check if the next byte is an ASCII byte that is not allowed in XML + * NameStartChar. If so, we can break out of the loop without calling the UTF-8 decoder. + * + * Even though this does not seem to be different from the ASCII fast path in the + * _wp_scan_utf8 function, skipping that function call still provides a ~50% speed + * improvement. */ - $codepoint = utf8_codepoint_at( + $is_non_name_ascii_byte = strspn( $this->xml, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . + " !\"#$%&'()*+,./;<=>?@[\\]^`{|}~\x7f", $offset + $name_byte_length, - $bytes_parsed - ); - if ( - // Byte sequence is not a valid UTF-8 codepoint. - ( 0xFFFD === $codepoint && 0 === $bytes_parsed ) || - // No codepoint at the given offset. - null === $codepoint || - // The codepoint is not a valid part of an XML NameChar or NameStartChar. - ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) - ) { + 1 + ) > 0; + if ( $is_non_name_ascii_byte ) { + break; + } + + // EOF. + if ( $offset + $name_byte_length >= strlen( $this->xml ) ) { + break; + } + + // The next byte sequence is, very likely, a UTF-8 codepoint. Let's + // try to decode it. + $at = $offset + $name_byte_length; + $new_at = $at; + $invalid_length = 0; + if ( 1 !== _wp_scan_utf8( $this->xml, $new_at, $invalid_length, null, 1 ) ) { + // EOF or invalid utf-8 byte sequence. + break; + } + + $codepoint_byte_length = $new_at - $at; + $codepoint = utf8_ord( substr( $this->xml, $at, $codepoint_byte_length ) ); + + // The codepoint is not a valid part of an XML NameChar or NameStartChar. + if ( ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) ) { break; } - $codepoint = null; - $name_byte_length += $bytes_parsed; + $name_byte_length += $codepoint_byte_length; + $at = $new_at; } return $name_byte_length; From 9088c622b28a533021a2647ea8fec9964b09aa42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sat, 1 Nov 2025 23:29:43 +0100 Subject: [PATCH 2/4] Update docstrings for XMLProcessor --- components/XML/class-xmlprocessor.php | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index 40f6ef5b..4964a86e 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -23,6 +23,9 @@ * * UTF-8 encoded * * Not standalone (so can use external entities) * * No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them) + * + * XML 1.1 is explicitly not a design goal here. Version 1.1 is + * more complex specification and not so widely supported. * * ### Possible future direction for this module * @@ -42,12 +45,6 @@ * * Date: Sat, 1 Nov 2025 23:34:05 +0100 Subject: [PATCH 3/4] remove space Added a blank line for better readability in comments. --- components/XML/class-xmlprocessor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index 4964a86e..d442f681 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -23,7 +23,7 @@ * * UTF-8 encoded * * Not standalone (so can use external entities) * * No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them) - * + * * XML 1.1 is explicitly not a design goal here. Version 1.1 is * more complex specification and not so widely supported. * From 7506e56671dcbbbd4c2cc24c51527f60abfd1264 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 01:04:19 +0100 Subject: [PATCH 4/4] Refactor XML processor comments for consistency --- components/XML/class-xmlprocessor.php | 30 +++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index d442f681..c2998a05 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -18,11 +18,11 @@ * It implements a subset of the XML 1.0 specification (https://www.w3.org/TR/xml/) * and supports XML documents with the following characteristics: * - * * XML 1.0 - * * Well-formed - * * UTF-8 encoded - * * Not standalone (so can use external entities) - * * No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them) + * – XML 1.0 + * – Well-formed + * – UTF-8 encoded + * – Not standalone (so can use external entities) + * – No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them) * * XML 1.1 is explicitly not a design goal here. Version 1.1 is * more complex specification and not so widely supported. @@ -1196,8 +1196,8 @@ protected function parse_next_token() { /** * Compute fully qualified attributes and assert: * - * * All attributes have valid namespaces. - * * No two attributes have the same (local name, namespace) pair. + * – All attributes have valid namespaces. + * – No two attributes have the same (local name, namespace) pair. * * @see https://www.w3.org/TR/2006/REC-xml-names11-20060816/#uniqAttrs */ @@ -1688,8 +1688,8 @@ private function parse_next_tag() { * names. * * Reference: - * * https://www.w3.org/TR/xml/#NT-STag - * * https://www.w3.org/TR/xml/#NT-Name + * – https://www.w3.org/TR/xml/#NT-STag + * – https://www.w3.org/TR/xml/#NT-Name */ $tag_name_length = $this->parse_name( $at + 1 ); if ( false === $tag_name_length ) { @@ -2344,9 +2344,9 @@ private function parse_name( $offset ) { * We use a the `_wp_scan_utf8` UTF-8 decoder introduced in WordPress 6.9. No other method * is reliable and available enough to depend on it in WordPress core: * - * * mb_ord() – is only available on 95% of hosts. - * * iconv_substr() – is not available on all hosts. - * * preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input + * – mb_ord() – is available on 99.5%+ or more of hosts, but not on all hosts. + * – iconv_substr() – is not available on all hosts. + * – preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input * contains an incomplete UTF-8 byte sequence – even * when that sequence comes after a valid match. This * failure mode cannot be reproduced with just any string. @@ -2358,11 +2358,11 @@ private function parse_name( $offset ) { * is pretty slow. The ASCII fast path below enables skipping most of the * UTF-8 decoder calls. * - * If the UTF-8 decoder performance ever becomes a bottleneck, there is a + * If the UTF-8 decoder performance ever becomes a bottleneck, there are a * few ways to significantly improve it: * - * * Call a native grapheme_ function when available. - * * Introduce a custom UTF-8 decoder optimized for codepoint-by-codepoint processing. + * – Call a native grapheme_ function when available. + * – Introduce a custom UTF-8 decoder optimized for codepoint-by-codepoint processing. * It could be the streaming version of the UTF-8 decoder, such as `_wp_iterate_utf8`, * that avoids the repeated strspn() calls. Alternatively, the older `utf8_codepoint_at` * function could be restored if its codepoint-by-codepoint decoding performance is