From d10f3b20f436868ac4fe5667c39d4ac5a8920d24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 22:57:52 +0100
Subject: [PATCH 1/4] [XMLProcessor] ASCII fast path for parsing names

---
 components/XML/class-xmlprocessor.php | 103 +++++++++++++++++++-------
 1 file changed, 78 insertions(+), 25 deletions(-)

diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php
index 5c15f8eb..40f6ef5b 100644
--- a/components/XML/class-xmlprocessor.php
+++ b/components/XML/class-xmlprocessor.php
@@ -5,7 +5,8 @@
 use WP_HTML_Span;
 use WP_HTML_Text_Replacement;
 
-use function WordPress\Encoding\utf8_codepoint_at;
+use function WordPress\Encoding\compat\_wp_scan_utf8;
+use function WordPress\Encoding\utf8_ord;
 
 /**
  * XML API: XMLProcessor class
@@ -43,9 +44,9 @@
  *
  * @TODO: Support XML 1.1.
  *
- * @TODO: Evaluate the performance of utf8_codepoint_at() against using the mbstring
+ * @TODO: Evaluate the performance of _wp_scan_utf8() against using the mbstring
  *        extension. If mbstring is faster, then use it whenever it's available with
- *        utf8_codepoint_at() as a fallback.
+ *        _wp_scan_utf8() as a fallback.
  *
  * @package WordPress
  * @subpackage HTML-API
@@ -2328,17 +2329,25 @@ private function skip_whitespace() {
 	 * @return int
 	 */
 	private function parse_name( $offset ) {
-		static $i         = 0;
 		$name_byte_length = 0;
+		$at               = $offset;
+
+		// Fast path: consume any ASCII NameStartChar bytes.
+		$name_byte_length += strspn(
+			$this->xml,
+			':ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz',
+			$offset + $name_byte_length,
+			1
+		);
+
 		while ( true ) {
 			/**
 			 * Parse the next unicode codepoint.
 			 *
-			 * We use a custom UTF-8 decoder here. No other method
-			 * is reliable and available enough to depend on it in
-			 * WordPress core:
+			 * We use a the `_wp_scan_utf8` UTF-8 decoder introduced in WordPress 6.9. No other method
+			 * is reliable and available enough to depend on it in WordPress core:
 			 *
-			 * * mb_ord() – is not available on all hosts.
+			 * * mb_ord() – is only available on 95% of hosts.
 			 * * iconv_substr() – is not available on all hosts.
 			 * * preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input
 			 *                  contains an incomplete UTF-8 byte sequence – even
@@ -2348,28 +2357,72 @@ private function parse_name( $offset ) {
 			 *                  how to reliably reproduce this failure mode in a
 			 *                  unit test.
 			 *
-			 * Performance-wise, character-by-character processing via utf8_codepoint_at
-			 * is still much faster than relying on preg_match(). The mbstring extension
-			 * is likely faster. It would be interesting to evaluate the performance
-			 * and prefer mbstring whenever it's available.
+			 * Performance-wise, character-by-character processing via _wp_scan_utf8
+			 * is pretty slow. The ASCII fast path below enables skipping most of the
+			 * UTF-8 decoder calls.
+			 *
+			 * If the UTF-8 decoder performance ever becomes a bottleneck, there is a
+			 * few ways to significantly improve it:
+			 *
+			 * * Call a native grapheme_ function when available.
+			 * * Introduce a custom UTF-8 decoder optimized for codepoint-by-codepoint processing.
+			 *   It could be the streaming version of the UTF-8 decoder, such as `_wp_iterate_utf8`,
+			 *   that avoids the repeated strspn() calls. Alternatively, the older `utf8_codepoint_at`
+			 *   function could be restored if its codepoint-by-codepoint decoding performance is
+			 *   better than the _wp_scan_utf8.
+			 */
+
+			/**
+			 * The ASCII speedup includes all ASCII NameStartChar, which are also valid
+			 * NameChar, making it possible to quickly scan past these bytes without
+			 * further processing.
+			 */
+			$name_byte_length += strspn( $this->xml, ":ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz-.0123456789\u{B7}", $offset + $name_byte_length );
+
+			/*
+			 * Quickly check if the next byte is an ASCII byte that is not allowed in XML
+			 * NameStartChar. If so, we can break out of the loop without calling the UTF-8 decoder.
+			 *
+			 * Even though this does not seem to be different from the ASCII fast path in the
+			 * _wp_scan_utf8 function, skipping that function call still provides a ~50% speed
+			 * improvement.
 			 */
-			$codepoint = utf8_codepoint_at(
+			$is_non_name_ascii_byte = strspn(
 				$this->xml,
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+				"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+				" !\"#$%&'()*+,./;<=>?@[\\]^`{|}~\x7f",
 				$offset + $name_byte_length,
-				$bytes_parsed
-			);
-			if (
-				// Byte sequence is not a valid UTF-8 codepoint.
-				( 0xFFFD === $codepoint && 0 === $bytes_parsed ) ||
-				// No codepoint at the given offset.
-				null === $codepoint ||
-				// The codepoint is not a valid part of an XML NameChar or NameStartChar.
-				! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length )
-			) {
+				1
+			) > 0;
+			if ( $is_non_name_ascii_byte ) {
+				break;
+			}
+
+			// EOF.
+			if ( $offset + $name_byte_length >= strlen( $this->xml ) ) {
+				break;
+			}
+
+			// The next byte sequence is, very likely, a UTF-8 codepoint. Let's
+			// try to decode it.
+			$at             = $offset + $name_byte_length;
+			$new_at         = $at;
+			$invalid_length = 0;
+			if ( 1 !== _wp_scan_utf8( $this->xml, $new_at, $invalid_length, null, 1 ) ) {
+				// EOF or invalid utf-8 byte sequence.
+				break;
+			}
+
+			$codepoint_byte_length = $new_at - $at;
+			$codepoint             = utf8_ord( substr( $this->xml, $at, $codepoint_byte_length ) );
+
+			// The codepoint is not a valid part of an XML NameChar or NameStartChar.
+			if ( ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) ) {
 				break;
 			}
-			$codepoint         = null;
-			$name_byte_length += $bytes_parsed;
+			$name_byte_length += $codepoint_byte_length;
+			$at                = $new_at;
 		}
 
 		return $name_byte_length;

From 9088c622b28a533021a2647ea8fec9964b09aa42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 23:29:43 +0100
Subject: [PATCH 2/4] Update docstrings for XMLProcessor

---
 components/XML/class-xmlprocessor.php | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php
index 40f6ef5b..4964a86e 100644
--- a/components/XML/class-xmlprocessor.php
+++ b/components/XML/class-xmlprocessor.php
@@ -23,6 +23,9 @@
  * * UTF-8 encoded
  * * Not standalone (so can use external entities)
  * * No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them)
+ * 
+ * XML 1.1 is explicitly not a design goal here. Version 1.1 is
+ * more complex specification and not so widely supported.
  *
  * ### Possible future direction for this module
  *
@@ -42,12 +45,6 @@
  *        * <!NOTATION, see https://www.w3.org/TR/xml/#sec-entity-decl
  *        * Conditional sections, see https://www.w3.org/TR/xml/#sec-condition-sect
  *
- * @TODO: Support XML 1.1.
- *
- * @TODO: Evaluate the performance of _wp_scan_utf8() against using the mbstring
- *        extension. If mbstring is faster, then use it whenever it's available with
- *        _wp_scan_utf8() as a fallback.
- *
  * @package WordPress
  * @subpackage HTML-API
  * @since WP_VERSION

From 5d4aa946c3c9daab35d916d039e24b58311a7817 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 23:34:05 +0100
Subject: [PATCH 3/4] remove space

Added a blank line for better readability in comments.
---
 components/XML/class-xmlprocessor.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php
index 4964a86e..d442f681 100644
--- a/components/XML/class-xmlprocessor.php
+++ b/components/XML/class-xmlprocessor.php
@@ -23,7 +23,7 @@
  * * UTF-8 encoded
  * * Not standalone (so can use external entities)
  * * No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them)
- * 
+ *
  * XML 1.1 is explicitly not a design goal here. Version 1.1 is
  * more complex specification and not so widely supported.
  *

From 7506e56671dcbbbd4c2cc24c51527f60abfd1264 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sun, 2 Nov 2025 01:04:19 +0100
Subject: [PATCH 4/4] Refactor XML processor comments for consistency

---
 components/XML/class-xmlprocessor.php | 30 +++++++++++++--------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php
index d442f681..c2998a05 100644
--- a/components/XML/class-xmlprocessor.php
+++ b/components/XML/class-xmlprocessor.php
@@ -18,11 +18,11 @@
  * It implements a subset of the XML 1.0 specification (https://www.w3.org/TR/xml/)
  * and supports XML documents with the following characteristics:
  *
- * * XML 1.0
- * * Well-formed
- * * UTF-8 encoded
- * * Not standalone (so can use external entities)
- * * No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them)
+ * – XML 1.0
+ * – Well-formed
+ * – UTF-8 encoded
+ * – Not standalone (so can use external entities)
+ * – No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them)
  *
  * XML 1.1 is explicitly not a design goal here. Version 1.1 is
  * more complex specification and not so widely supported.
@@ -1196,8 +1196,8 @@ protected function parse_next_token() {
 			/**
 			 * Compute fully qualified attributes and assert:
 			 *
-			 * * All attributes have valid namespaces.
-			 * * No two attributes have the same (local name, namespace) pair.
+			 * – All attributes have valid namespaces.
+			 * – No two attributes have the same (local name, namespace) pair.
 			 *
 			 * @see https://www.w3.org/TR/2006/REC-xml-names11-20060816/#uniqAttrs
 			 */
@@ -1688,8 +1688,8 @@ private function parse_next_tag() {
 			 * names.
 			 *
 			 * Reference:
-			 * * https://www.w3.org/TR/xml/#NT-STag
-			 * * https://www.w3.org/TR/xml/#NT-Name
+			 * – https://www.w3.org/TR/xml/#NT-STag
+			 * – https://www.w3.org/TR/xml/#NT-Name
 			 */
 			$tag_name_length = $this->parse_name( $at + 1 );
 			if ( false === $tag_name_length ) {
@@ -2344,9 +2344,9 @@ private function parse_name( $offset ) {
 			 * We use a the `_wp_scan_utf8` UTF-8 decoder introduced in WordPress 6.9. No other method
 			 * is reliable and available enough to depend on it in WordPress core:
 			 *
-			 * * mb_ord() – is only available on 95% of hosts.
-			 * * iconv_substr() – is not available on all hosts.
-			 * * preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input
+			 * – mb_ord() – is available on 99.5%+ or more of hosts, but not on all hosts.
+			 * – iconv_substr() – is not available on all hosts.
+			 * – preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input
 			 *                  contains an incomplete UTF-8 byte sequence – even
 			 *                  when that sequence comes after a valid match. This
 			 *                  failure mode cannot be reproduced with just any string.
@@ -2358,11 +2358,11 @@ private function parse_name( $offset ) {
 			 * is pretty slow. The ASCII fast path below enables skipping most of the
 			 * UTF-8 decoder calls.
 			 *
-			 * If the UTF-8 decoder performance ever becomes a bottleneck, there is a
+			 * If the UTF-8 decoder performance ever becomes a bottleneck, there are a
 			 * few ways to significantly improve it:
 			 *
-			 * * Call a native grapheme_ function when available.
-			 * * Introduce a custom UTF-8 decoder optimized for codepoint-by-codepoint processing.
+			 * – Call a native grapheme_ function when available.
+			 * – Introduce a custom UTF-8 decoder optimized for codepoint-by-codepoint processing.
 			 *   It could be the streaming version of the UTF-8 decoder, such as `_wp_iterate_utf8`,
 			 *   that avoids the repeated strspn() calls. Alternatively, the older `utf8_codepoint_at`
 			 *   function could be restored if its codepoint-by-codepoint decoding performance is