From 32b5b256eba87413e5800d4f195674d6c70004da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Fri, 31 Oct 2025 13:43:44 +0100
Subject: [PATCH 01/28] Migrate to the new utf8.php decoder

---
 components/Blueprints/class-runner.php |   9 +-
 components/Encoding/compat-utf8.php    | 567 +++++++++++++++++
 components/Encoding/composer.json      |  68 ++-
 components/Encoding/utf8-decoder.php   | 809 +------------------------
 components/Encoding/utf8.php           | 179 ++++++
 5 files changed, 788 insertions(+), 844 deletions(-)
 create mode 100644 components/Encoding/compat-utf8.php
 create mode 100644 components/Encoding/utf8.php

diff --git a/components/Blueprints/class-runner.php b/components/Blueprints/class-runner.php
index 4d7ab908..c4471d70 100644
--- a/components/Blueprints/class-runner.php
+++ b/components/Blueprints/class-runner.php
@@ -55,7 +55,7 @@
 use WordPress\HttpClient\Client;
 use WordPress\Zip\ZipFilesystem;
 
-use function WordPress\Encoding\_wp_has_noncharacters_fallback;
+use function WordPress\Encoding\wp_has_noncharacters;
 use function WordPress\Filesystem\wp_unix_sys_get_temp_dir;
 use function WordPress\Zip\is_zip_file_stream;
 
@@ -379,12 +379,7 @@ private function load_blueprint() {
 		// Validate the Blueprint string we've just loaded.
 
 		// **UTF-8 Encoding:** Assert the Blueprint input is UTF-8 encoded.
-		$is_valid_utf8 = false;
-		if ( function_exists( 'mb_check_encoding' ) ) {
-			$is_valid_utf8 = mb_check_encoding( $blueprint_string, 'UTF-8' );
-		} else {
-			$is_valid_utf8 = ! _wp_has_noncharacters_fallback( $blueprint_string );
-		}
+		$is_valid_utf8 = \WordPress\Encoding\_wp_is_valid_utf8_fallback( $blueprint_string ); // Check the encoding itself; noncharacters are still valid UTF-8.
 
 		if ( ! $is_valid_utf8 ) {
 			throw new BlueprintExecutionException( 'Blueprint must be encoded as UTF-8.' );
diff --git a/components/Encoding/compat-utf8.php b/components/Encoding/compat-utf8.php
new file mode 100644
index 00000000..ab6e8cd0
--- /dev/null
+++ b/components/Encoding/compat-utf8.php
@@ -0,0 +1,567 @@
+<?php
+
+namespace WordPress\Encoding;
+
+/**
+ * Finds spans of valid and invalid UTF-8 bytes in a given string.
+ *
+ * This is a low-level tool to power various UTF-8 functionality.
+ * It scans through a string until it finds invalid byte spans.
+ * When it does this, it does three things:
+ *
+ *  - Assigns `$at` to the position after the last successful code point.
+ *  - Assigns `$invalid_length` to the length of the maximal subpart of
+ *    the invalid bytes starting at `$at`.
+ *  - Returns how many code points were successfully scanned.
+ *
+ * This information is enough to build a number of useful UTF-8 functions.
+ *
+ * Example:
+ *
+ *     // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
+ *     "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
+ *     $at = $invalid_length = 0;
+ *
+ *     // The first step finds the invalid 0xF1 byte.
+ *     2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
+ *     $at === 2; $invalid_length === 1;
+ *
+ *     // After skipping the invalid span (`$at += $invalid_length`), the second step continues to the end.
+ *     1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
+ *     $at === 4; $invalid_length === 0;
+ *
+ * Note! While passing an options array here might be convenient from a calling-code standpoint,
+ *       this function is intended to serve as a very low-level foundation upon which to build
+ *       higher level functionality. For the sake of keeping costs explicit all arguments are
+ *       passed directly.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string    $bytes             UTF-8 encoded string which might include invalid spans of bytes.
+ * @param int       $at                Where to start scanning.
+ * @param int       $invalid_length    Will be set to how many bytes are to be ignored after `$at`.
+ * @param int|null  $max_bytes         Stop scanning after this many bytes have been seen.
+ * @param int|null  $max_code_points   Stop scanning after this many code points have been seen.
+ * @param bool|null $has_noncharacters Set to indicate if scanned string contained noncharacters.
+ * @return int How many code points were successfully scanned.
+ */
+function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
+	$byte_length       = strlen( $bytes );
+	$end               = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
+	$invalid_length    = 0;
+	$count             = 0;
+	$max_count         = $max_code_points ?? PHP_INT_MAX;
+	$has_noncharacters = false; // Reset on every call; only this scan's findings are reported.
+
+	for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
+		/*
+		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
+		 *
+		 * This optimization step improves the speed from 10x to 100x
+		 * depending on whether the JIT has optimized the function.
+		 */
+		$ascii_byte_count = strspn(
+			$bytes,
+			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
+			$i,
+			$end - $i
+		);
+
+		if ( $count + $ascii_byte_count >= $max_count ) {
+			$at    = $i + ( $max_count - $count );
+			$count = $max_count;
+			return $count;
+		}
+
+		$count += $ascii_byte_count;
+		$i     += $ascii_byte_count;
+
+		if ( $i >= $end ) {
+			$at = $end;
+			return $count;
+		}
+
+		/**
+		 * The above fast-track handled all single-byte UTF-8 characters. What
+		 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
+		 *
+		 * Therefore everything past here is checking those multibyte sequences.
+		 *
+		 * It may look like there’s a need to check against the max bytes here,
+		 * but since each match of a single character returns, this function will
+		 * bail already if crossing the max-bytes threshold. This function SHALL
+		 * NOT return in the middle of a multi-byte character, so if a character
+		 * falls on each side of the max bytes, the entire character will be scanned.
+		 *
+		 * Because it’s possible that there are truncated characters, the use of
+		 * the null-coalescing operator with "\xC0" is a convenience for skipping
+		 * length checks on every continuation byte. This works because 0xC0 is
+		 * always invalid in a UTF-8 string, meaning that if the string has been
+		 * truncated, it will find 0xC0 and reject as invalid UTF-8.
+		 *
+		 * > [The following table] lists all of the byte sequences that are well-formed
+		 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
+		 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
+		 * > outside of the ranges listed is ill-formed.
+		 *
+		 * > Table 3-7. Well-Formed UTF-8 Byte Sequences
+		 *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
+		 *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
+		 *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
+		 *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
+		 *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
+		 *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
+		 *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
+		 *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
+		 *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
+		 *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
+		 *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
+		 *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
+		 *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
+		 *
+		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
+		 */
+
+		// Valid two-byte code points.
+		$b1 = ord( $bytes[ $i ] );
+		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
+
+		if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
+			++$count;
+			++$i;
+			continue;
+		}
+
+		// Valid three-byte code points.
+		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
+
+		if ( $b3 < 0x80 || $b3 > 0xBF ) {
+			goto invalid_utf8;
+		}
+
+		if (
+			( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
+			( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+			( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
+			( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
+		) {
+			++$count;
+			$i += 2;
+
+			// Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF. Logical OR keeps the flag a real bool.
+			if ( 0xEF === $b1 ) {
+				$has_noncharacters = $has_noncharacters || (
+					( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
+					( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
+				);
+			}
+
+			continue;
+		}
+
+		// Valid four-byte code points.
+		$b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
+
+		if ( $b4 < 0x80 || $b4 > 0xBF ) {
+			goto invalid_utf8;
+		}
+
+		if (
+			( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
+			( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+			( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
+		) {
+			++$count;
+			$i += 3;
+
+			// Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
+			$has_noncharacters = $has_noncharacters || (
+				( 0x0F === ( $b2 & 0x0F ) ) &&
+				0xBF === $b3 &&
+				( 0xBE === $b4 || 0xBF === $b4 )
+			);
+
+			continue;
+		}
+
+		/**
+		 * When encountering invalid byte sequences, Unicode suggests finding the
+		 * maximal subpart of a text and replacing that subpart with a single
+		 * replacement character.
+		 *
+		 * > This practice is more secure because it does not result in the
+		 * > conversion consuming parts of valid sequences as though they were
+		 * > invalid. It also guarantees at least one replacement character will
+		 * > occur for each instance of an invalid sequence in the original text.
+		 * > Furthermore, this practice can be defined consistently for better
+		 * > interoperability between different implementations of conversion.
+		 *
+		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
+		 */
+		invalid_utf8:
+		$at             = $i;
+		$invalid_length = 1;
+
+		// Single-byte and two-byte characters.
+		if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) {
+			return $count;
+		}
+
+		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
+		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
+
+		// Find the maximal subpart and skip past it.
+		if ( 0xE0 === ( $b1 & 0xF0 ) ) {
+			// Three-byte characters.
+			$b2_valid = (
+				( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
+				( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+				( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
+				( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
+			);
+
+			$invalid_length = min( $end - $i, $b2_valid ? 2 : 1 );
+			return $count;
+		} elseif ( 0xF0 === ( $b1 & 0xF8 ) ) {
+			// Four-byte characters.
+			$b2_valid = (
+				( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
+				( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+				( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
+			);
+
+			$b3_valid = $b3 >= 0x80 && $b3 <= 0xBF;
+
+			$invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 );
+			return $count;
+		}
+
+		return $count;
+	}
+
+	$at = $i;
+	return $count;
+}
+
+/**
+ * Fallback mechanism for safely validating UTF-8 bytes.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see wp_is_valid_utf8()
+ *
+ * @param string $bytes String which might contain text encoded as UTF-8.
+ * @return bool Whether the provided bytes can decode as valid UTF-8.
+ */
+function _wp_is_valid_utf8_fallback( string $bytes ): bool {
+	$length = strlen( $bytes );
+	if ( 0 === $length ) {
+		return true; // An empty string is trivially valid.
+	}
+
+	// One full scan must consume every byte without reporting an invalid span.
+	$stopped_at  = 0;
+	$invalid_len = 0;
+	_wp_scan_utf8( $bytes, $stopped_at, $invalid_len );
+
+	return 0 === $invalid_len && $length === $stopped_at;
+}
+
+/**
+ * Fallback mechanism for replacing invalid spans of UTF-8 bytes.
+ *
+ * Example:
+ *
+ *     'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see wp_scrub_utf8()
+ *
+ * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes.
+ * @return string Input string with spans of invalid bytes swapped with the replacement character.
+ */
+function _wp_scrub_utf8_fallback( string $bytes ): string {
+	$total       = strlen( $bytes );
+	$cursor      = 0; // Where the scanner currently stands.
+	$copied_upto = 0; // Bytes before this offset are already in $output.
+	$bad_length  = 0;
+	$output      = '';
+
+	while ( $cursor <= $total ) {
+		_wp_scan_utf8( $bytes, $cursor, $bad_length );
+		if ( $cursor < $total ) {
+			// Stopped on an invalid span: keep the valid run and put U+FFFD in place of the span.
+			$output     .= substr( $bytes, $copied_upto, $cursor - $copied_upto ) . "\u{FFFD}";
+			$cursor     += $bad_length;
+			$copied_upto = $cursor;
+			continue;
+		}
+
+		// End of input. If nothing was ever replaced the original string is returned untouched.
+		if ( 0 === $copied_upto ) {
+			return $bytes;
+		}
+
+		return $output . substr( $bytes, $copied_upto, $cursor - $copied_upto - $bad_length );
+	}
+
+	return $output;
+}
+
+/**
+ * Returns how many code points are found in the given UTF-8 string.
+ *
+ * Invalid spans of bytes count as a single code point according
+ * to the maximal subpart rule. This function is a fallback method
+ * for calling `mb_strlen( $text, 'UTF-8' )`.
+ *
+ * When negative values are provided for the byte offsets or length,
+ * this will always report zero code points.
+ *
+ * Example:
+ *
+ *     4  === _wp_utf8_codepoint_count( 'text' );
+ *
+ *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
+ *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string $text            Count code points in this string.
+ * @param ?int   $byte_offset     Start counting after this many bytes in `$text`. Must not be negative.
+ * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
+ *                                Default is to scan until the end of the string. Must not be negative.
+ * @return int How many code points were found.
+ */
+function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
+	$byte_offset     = $byte_offset ?? 0; // Both arguments are nullable: null means "use the default".
+	$max_byte_length = $max_byte_length ?? PHP_INT_MAX;
+	if ( $byte_offset < 0 ) {
+		return 0;
+	}
+
+	$count           = 0;
+	$at              = $byte_offset;
+	$end             = strlen( $text );
+	$invalid_length  = 0;
+	$max_byte_length = min( $end - $at, $max_byte_length ); // Never scan past the end of the text.
+	while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
+		$count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );
+		$count += $invalid_length > 0 ? 1 : 0; // An invalid span counts as a single code point.
+		$at    += $invalid_length;
+	}
+	return $count;
+}
+
+/**
+ * Given a starting offset within a string and a maximum number of code points,
+ * return how many bytes are occupied by the span of characters.
+ *
+ * Invalid spans of bytes count as a single code point according to the maximal
+ * subpart rule. This function is a fallback method for calling
+ * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string $text              Count bytes of span in this text.
+ * @param int    $byte_offset       Start counting at this byte offset.
+ * @param int    $max_code_points   Stop counting after this many code points have been seen,
+ *                                  or at the end of the string.
+ * @param ?int   $found_code_points Optional. Will be set to number of found code points in
+ *                                  span, as this might be smaller than the maximum count if
+ *                                  the string is not long enough.
+ * @return int Number of bytes spanned by the code points.
+ */
+function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
+	$start             = $byte_offset;
+	$text_length       = strlen( $text );
+	$skip_length       = 0;
+	$found_code_points = 0;
+
+	// Each pass consumes one run of valid code points, then at most one invalid span.
+	while ( $found_code_points < $max_code_points && $byte_offset < $text_length ) {
+		$found_code_points += _wp_scan_utf8( $text, $byte_offset, $skip_length, null, $max_code_points - $found_code_points );
+
+		if ( 0 === $skip_length || $found_code_points >= $max_code_points ) {
+			continue;
+		}
+
+		// An invalid span is worth exactly one code point, however many bytes it covers.
+		$byte_offset += $skip_length;
+		++$found_code_points;
+	}
+
+	return $byte_offset - $start;
+}
+
+/**
+ * Fallback support for determining if a string contains Unicode noncharacters.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see \wp_has_noncharacters()
+ *
+ * @param string $text Are there noncharacters in this string?
+ * @return bool Whether noncharacters were found in the string.
+ */
+function _wp_has_noncharacters_fallback( string $text ): bool {
+	$found       = false;
+	$skip_length = 0;
+	$text_length = strlen( $text );
+
+	// Scan one valid run per pass, hopping over invalid spans, until a noncharacter turns up.
+	for ( $offset = 0; $offset < $text_length && ! $found; $offset += $skip_length ) {
+		_wp_scan_utf8( $text, $offset, $skip_length, null, null, $found );
+	}
+
+	// The scanner may hand back an integer flag, so normalize it to a real boolean.
+	return (bool) $found;
+}
+
+/**
+ * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
+ * with the deprecated function from the PHP standard library.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see \utf8_encode()
+ *
+ * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
+ * @return string Text converted into UTF-8.
+ */
+function _wp_utf8_encode_fallback( $iso_8859_1_text ) {
+	$iso_8859_1_text = (string) $iso_8859_1_text;
+	$length          = strlen( $iso_8859_1_text );
+	$cursor          = 0;
+	$flushed_upto    = 0;
+	$encoded         = '';
+
+	/*
+	 * Bytes 0x00–0x7F mean the same thing in ISO-8859-1 and in UTF-8, so runs of
+	 * them are copied verbatim. Listing them explicitly lets strspn() measure a run.
+	 */
+	$ascii =
+		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+		"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+		" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f";
+
+	while ( $cursor < $length ) {
+		$run = strspn( $iso_8859_1_text, $ascii, $cursor );
+		if ( $run > 0 ) {
+			$cursor += $run;
+			continue;
+		}
+
+		// Every byte from 0x80 to 0xFF becomes the two-byte UTF-8 form of U+0080–U+00FF.
+		$latin1_byte = ord( $iso_8859_1_text[ $cursor ] );
+		$lead        = chr( 0xC0 | ( $latin1_byte >> 6 ) );
+		$trail       = chr( 0x80 | ( $latin1_byte & 0x3F ) );
+
+		// Flush the untouched bytes seen since the last conversion, then the new pair.
+		$encoded .= substr( $iso_8859_1_text, $flushed_upto, $cursor - $flushed_upto ) . $lead . $trail;
+		++$cursor;
+		$flushed_upto = $cursor;
+	}
+
+	// No conversion ever happened: the input was pure US-ASCII and is returned as-is.
+	if ( 0 === $flushed_upto ) {
+		return $iso_8859_1_text;
+	}
+
+	return $encoded . substr( $iso_8859_1_text, $flushed_upto );
+}
+
+/**
+ * Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility
+ * with the deprecated function from the PHP standard library.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see \utf8_decode()
+ *
+ * @param string $utf8_text Text treated as UTF-8 bytes.
+ * @return string Text converted into ISO-8859-1.
+ */
+function _wp_utf8_decode_fallback( $utf8_text ) {
+	$utf8_text       = (string) $utf8_text;
+	$at              = 0; // Current scan position, in bytes.
+	$was_at          = 0; // Start of the bytes not yet copied into the output.
+	$end             = strlen( $utf8_text );
+	$iso_8859_1_text = '';
+
+	while ( $at < $end ) {
+		// US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
+		$ascii_byte_count = strspn(
+			$utf8_text,
+			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
+			$at
+		);
+
+		if ( $ascii_byte_count > 0 ) {
+			$at += $ascii_byte_count;
+			continue;
+		}
+		// Not ASCII: ask the scanner for at most one code point starting at the current position.
+		$next_at        = $at;
+		$invalid_length = 0;
+		$found          = _wp_scan_utf8( $utf8_text, $next_at, $invalid_length, null, 1 );
+		$span_length    = $next_at - $at; // Byte length of the code point that was scanned, if any.
+		$next_byte      = '?'; // Emitted unless the code point turns out to be convertible.
+
+		if ( 1 !== $found ) {
+			if ( $invalid_length > 0 ) {
+				$next_byte = ''; // The '?' for the invalid span is appended after the flush below.
+				goto flush_sub_part;
+			}
+
+			break; // Nothing scanned and nothing invalid: stop; the tail copy after the loop handles the rest.
+		}
+
+		// All convertible code points are two bytes long; anything else is flushed as '?'.
+		$byte1 = ord( $utf8_text[ $at ] );
+		if ( 0xC0 !== ( $byte1 & 0xE0 ) ) {
+			goto flush_sub_part;
+		}
+
+		// All convertible code points are not greater than U+FF; larger ones are flushed as '?'.
+		$byte2      = ord( $utf8_text[ $at + 1 ] );
+		$code_point = ( ( $byte1 & 0x1F ) << 6 ) | ( ( $byte2 & 0x3F ) );
+		if ( $code_point > 0xFF ) {
+			goto flush_sub_part;
+		}
+
+		$next_byte = chr( $code_point );
+		// Copy the pending unconverted bytes, append the replacement byte, and step past the code point.
+		flush_sub_part:
+		$iso_8859_1_text .= substr( $utf8_text, $was_at, $at - $was_at );
+		$iso_8859_1_text .= $next_byte;
+		$at              += $span_length;
+		$was_at           = $at;
+		// An invalid span reported by the scanner becomes a single '?' and is skipped whole.
+		if ( $invalid_length > 0 ) {
+			$iso_8859_1_text .= '?';
+			$at              += $invalid_length;
+			$was_at           = $at;
+		}
+	}
+	// $was_at only stays at zero when nothing was ever flushed, so the input is returned unchanged.
+	if ( 0 === $was_at ) {
+		return $utf8_text;
+	}
+
+	$iso_8859_1_text .= substr( $utf8_text, $was_at ); // Copy whatever remains after the last flush.
+	return $iso_8859_1_text;
+}
diff --git a/components/Encoding/composer.json b/components/Encoding/composer.json
index 1bf6c1da..eba3afd0 100644
--- a/components/Encoding/composer.json
+++ b/components/Encoding/composer.json
@@ -1,35 +1,37 @@
 {
-    "name": "wp-php-toolkit/encoding",
-    "description": "Encoding component for WordPress.",
-    "type": "library",
-    "license": "GPL-2.0-or-later",
-    "authors": [
-        {
-            "name": "Adam Zielinski",
-            "email": "adam@adamziel.com"
-        },
-        {
-            "name": "WordPress Team",
-            "email": "wordpress@wordpress.org"
-        }
-    ],
-    "require": {
-        "php": ">=7.2"
-    },
-    "autoload": {
-        "files": [
-            "utf8-decoder.php",
-            "utf8-encoder.php"
-        ],
-        "exclude-from-classmap": [
-            "/Tests/"
-        ]
-    },
-    "archive": {
-        "exclude": [
-            "**/.github/",
-            "**/Tests/",
-            "**/bin/"
-        ]
-    }
+	"name": "wp-php-toolkit/encoding",
+	"description": "Encoding component for WordPress.",
+	"type": "library",
+	"license": "GPL-2.0-or-later",
+	"authors": [
+		{
+			"name": "Adam Zielinski",
+			"email": "adam@adamziel.com"
+		},
+		{
+			"name": "WordPress Team",
+			"email": "wordpress@wordpress.org"
+		}
+	],
+	"require": {
+		"php": ">=7.2"
+	},
+	"autoload": {
+		"files": [
+			"utf8.php",
+			"compat-utf8.php",
+			"utf8-decoder.php",
+			"utf8-encoder.php"
+		],
+		"exclude-from-classmap": [
+			"/Tests/"
+		]
+	},
+	"archive": {
+		"exclude": [
+			"**/.github/",
+			"**/Tests/",
+			"**/bin/"
+		]
+	}
 }
diff --git a/components/Encoding/utf8-decoder.php b/components/Encoding/utf8-decoder.php
index e339e710..74e685e3 100644
--- a/components/Encoding/utf8-decoder.php
+++ b/components/Encoding/utf8-decoder.php
@@ -10,786 +10,6 @@
  * without crashing or depending on the mbstring extension.
  */
 
-if ( ! defined( 'UTF8_DECODER_ACCEPT' ) ) {
-	define( 'UTF8_DECODER_ACCEPT', 0 );
-}
-
-if ( ! defined( 'UTF8_DECODER_REJECT' ) ) {
-	define( 'UTF8_DECODER_REJECT', 1 );
-}
-
-/**
- * Finds spans of valid and invalid UTF-8 bytes in a given string.
- *
- * This is a low-level tool to power various UTF-8 functionality.
- * It scans through a string until it finds invalid byte spans.
- * When it does this, it does three things:
- *
- *  - Assigns `$at` to the position after the last successful code point.
- *  - Assigns `$invalid_length` to the length of the maximal subpart of
- *    the invalid bytes starting at `$at`.
- *  - Returns how many code points were successfully scanned.
- *
- * This information is enough to build a number of useful UTF-8 functions.
- *
- * Example:
- *
- *     // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
- *     "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
- *     $at = $invalid_length = 0;
- *
- *     // The first step finds the invalid 0xF1 byte.
- *     2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
- *     $at === 2; $invalid_length === 1;
- *
- *     // The second step continues to the end of the string.
- *     1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
- *     $at === 4; $invalid_length === 0;
- *
- * Note! While passing an options array here might be convenient from a calling-code standpoint,
- *       this function is intended to serve as a very low-level foundation upon which to build
- *       higher level functionality. For the sake of keeping costs explicit all arguments are
- *       passed directly.
- *
- * @since 6.9.0
- * @access private
- *
- * @param string    $bytes             UTF-8 encoded string which might include invalid spans of bytes.
- * @param int       $at                Where to start scanning.
- * @param int       $invalid_length    Will be set to how many bytes are to be ignored after `$at`.
- * @param int|null  $max_bytes         Stop scanning after this many bytes have been seen.
- * @param int|null  $max_code_points   Stop scanning after this many code points have been seen.
- * @param bool|null $has_noncharacters Set to indicate if scanned string contained noncharacters.
- * @return int How many code points were successfully scanned.
- */
-function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
-	$byte_length       = strlen( $bytes );
-	$end               = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
-	$invalid_length    = 0;
-	$count             = 0;
-	$max_count         = $max_code_points ?? PHP_INT_MAX;
-	$has_noncharacters = false;
-
-	for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
-		/*
-		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
-		 *
-		 * This optimization step improves the speed from 10x to 100x
-		 * depending on whether the JIT has optimized the function.
-		 */
-		$ascii_byte_count = strspn(
-			$bytes,
-			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
-			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
-			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
-			$i,
-			$end - $i
-		);
-
-		if ( $count + $ascii_byte_count >= $max_count ) {
-			$at    = $i + ( $max_count - $count );
-			$count = $max_count;
-			return $count;
-		}
-
-		$count += $ascii_byte_count;
-		$i     += $ascii_byte_count;
-
-		if ( $i >= $end ) {
-			$at = $end;
-			return $count;
-		}
-
-		/**
-		 * The above fast-track handled all single-byte UTF-8 characters. What
-		 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
-		 *
-		 * Therefore everything past here is checking those multibyte sequences.
-		 *
-		 * It may look like there’s a need to check against the max bytes here,
-		 * but since each match of a single character returns, this functions will
-		 * bail already if crossing the max-bytes threshold. This function SHALL
-		 * NOT return in the middle of a multi-byte character, so if a character
-		 * falls on each side of the max bytes, the entire character will be scanned.
-		 *
-		 * Because it’s possible that there are truncated characters, the use of
-		 * the null-coalescing operator with "\xC0" is a convenience for skipping
-		 * length checks on every continuation bytes. This works because 0xC0 is
-		 * always invalid in a UTF-8 string, meaning that if the string has been
-		 * truncated, it will find 0xC0 and reject as invalid UTF-8.
-		 *
-		 * > [The following table] lists all of the byte sequences that are well-formed
-		 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
-		 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
-		 * > outside of the ranges listed is ill-formed.
-		 *
-		 * > Table 3-7. Well-Formed UTF-8 Byte Sequences
-		 *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
-		 *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
-		 *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
-		 *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
-		 *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
-		 *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
-		 *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
-		 *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
-		 *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
-		 *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
-		 *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
-		 *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
-		 *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
-		 *
-		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
-		 */
-
-		// Valid two-byte code points.
-		$b1 = ord( $bytes[ $i ] );
-		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
-
-		if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
-			++$count;
-			++$i;
-			continue;
-		}
-
-		// Valid three-byte code points.
-		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
-
-		if ( $b3 < 0x80 || $b3 > 0xBF ) {
-			goto invalid_utf8;
-		}
-
-		if (
-			( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
-			( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
-			( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
-			( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
-		) {
-			++$count;
-			$i += 2;
-
-			// Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
-			if ( 0xEF === $b1 ) {
-				$has_noncharacters |= (
-					( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
-					( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
-				);
-			}
-
-			continue;
-		}
-
-		// Valid four-byte code points.
-		$b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
-
-		if ( $b4 < 0x80 || $b4 > 0xBF ) {
-			goto invalid_utf8;
-		}
-
-		if (
-			( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
-			( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
-			( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
-		) {
-			++$count;
-			$i += 3;
-
-			// Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
-			$has_noncharacters |= (
-				( 0x0F === ( $b2 & 0x0F ) ) &&
-				0xBF === $b3 &&
-				( 0xBE === $b4 || 0xBF === $b4 )
-			);
-
-			continue;
-		}
-
-		/**
-		 * When encountering invalid byte sequences, Unicode suggests finding the
-		 * maximal subpart of a text and replacing that subpart with a single
-		 * replacement character.
-		 *
-		 * > This practice is more secure because it does not result in the
-		 * > conversion consuming parts of valid sequences as though they were
-		 * > invalid. It also guarantees at least one replacement character will
-		 * > occur for each instance of an invalid sequence in the original text.
-		 * > Furthermore, this practice can be defined consistently for better
-		 * > interoperability between different implementations of conversion.
-		 *
-		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
-		 */
-		invalid_utf8:
-		$at             = $i;
-		$invalid_length = 1;
-
-		// Single-byte and two-byte characters.
-		if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) {
-			return $count;
-		}
-
-		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
-		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
-
-		// Find the maximal subpart and skip past it.
-		if ( 0xE0 === ( $b1 & 0xF0 ) ) {
-			// Three-byte characters.
-			$b2_valid = (
-				( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
-				( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
-				( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
-				( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
-			);
-
-			$invalid_length = min( $end - $i, $b2_valid ? 2 : 1 );
-			return $count;
-		} elseif ( 0xF0 === ( $b1 & 0xF8 ) ) {
-			// Four-byte characters.
-			$b2_valid = (
-				( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
-				( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
-				( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
-			);
-
-			$b3_valid = $b3 >= 0x80 && $b3 <= 0xBF;
-
-			$invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 );
-			return $count;
-		}
-
-		return $count;
-	}
-
-	$at = $i;
-	return $count;
-}
-
-/**
- * Fallback mechanism for safely validating UTF-8 bytes.
- *
- * @since 6.9.0
- * @access private
- *
- * @see wp_is_valid_utf8()
- *
- * @param string $bytes String which might contain text encoded as UTF-8.
- * @return bool Whether the provided bytes can decode as valid UTF-8.
- */
-function _wp_is_valid_utf8_fallback( string $bytes ): bool {
-	$bytes_length = strlen( $bytes );
-	if ( 0 === $bytes_length ) {
-		return true;
-	}
-
-	$next_byte_at   = 0;
-	$invalid_length = 0;
-
-	_wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
-
-	return $bytes_length === $next_byte_at && 0 === $invalid_length;
-}
-
-/**
- * Fallback mechanism for replacing invalid spans of UTF-8 bytes.
- *
- * Example:
- *
- *     'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252.
- *
- * @since 6.9.0
- * @access private
- *
- * @see wp_scrub_utf8()
- *
- * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes.
- * @return string Input string with spans of invalid bytes swapped with the replacement character.
- */
-function _wp_scrub_utf8_fallback( string $bytes ): string {
-	$bytes_length   = strlen( $bytes );
-	$next_byte_at   = 0;
-	$was_at         = 0;
-	$invalid_length = 0;
-	$scrubbed       = '';
-
-	while ( $next_byte_at <= $bytes_length ) {
-		_wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
-
-		if ( $next_byte_at >= $bytes_length ) {
-			if ( 0 === $was_at ) {
-				return $bytes;
-			}
-
-			return $scrubbed . substr( $bytes, $was_at, $next_byte_at - $was_at - $invalid_length );
-		}
-
-		$scrubbed .= substr( $bytes, $was_at, $next_byte_at - $was_at );
-		$scrubbed .= "\u{FFFD}";
-
-		$next_byte_at += $invalid_length;
-		$was_at        = $next_byte_at;
-	}
-
-	return $scrubbed;
-}
-
-/**
- * Returns how many code points are found in the given UTF-8 string.
- *
- * Invalid spans of bytes count as a single code point according
- * to the maximal subpart rule. This function is a fallback method
- * for calling `mb_strlen( $text, 'UTF-8' )`.
- *
- * When negative values are provided for the byte offsets or length,
- * this will always report zero code points.
- *
- * Example:
- *
- *     4  === _wp_utf8_codepoint_count( 'text' );
- *
- *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
- *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
- *
- * @since 6.9.0
- * @access private
- *
- * @param string $text            Count code points in this string.
- * @param ?int   $byte_offset     Start counting after this many bytes in `$text`. Must be positive.
- * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
- *                                Default is to scan until the end of the string. Must be positive.
- * @return int How many code points were found.
- */
-function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
-	if ( $byte_offset < 0 ) {
-		return 0;
-	}
-
-	$count           = 0;
-	$at              = $byte_offset;
-	$end             = strlen( $text );
-	$invalid_length  = 0;
-	$max_byte_length = min( $end - $at, $max_byte_length );
-
-	while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
-		$count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );
-		$count += $invalid_length > 0 ? 1 : 0;
-		$at    += $invalid_length;
-	}
-
-	return $count;
-}
-
-/**
- * Given a starting offset within a string and a maximum number of code points,
- * return how many bytes are occupied by the span of characters.
- *
- * Invalid spans of bytes count as a single code point according to the maximal
- * subpart rule. This function is a fallback method for calling
- * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`.
- *
- * @since 6.9.0
- * @access private
- *
- * @param string $text              Count bytes of span in this text.
- * @param int    $byte_offset       Start counting at this byte offset.
- * @param int    $max_code_points   Stop counting after this many code points have been seen,
- *                                  or at the end of the string.
- * @param ?int   $found_code_points Optional. Will be set to number of found code points in
- *                                  span, as this might be smaller than the maximum count if
- *                                  the string is not long enough.
- * @return int Number of bytes spanned by the code points.
- */
-function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
-	$was_at            = $byte_offset;
-	$invalid_length    = 0;
-	$end               = strlen( $text );
-	$found_code_points = 0;
-
-	while ( $byte_offset < $end && $found_code_points < $max_code_points ) {
-		$needed      = $max_code_points - $found_code_points;
-		$chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed );
-
-		$found_code_points += $chunk_count;
-
-		// Invalid spans only convey one code point count regardless of how long they are.
-		if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) {
-			++$found_code_points;
-			$byte_offset += $invalid_length;
-		}
-	}
-
-	return $byte_offset - $was_at;
-}
-
-/**
- * Fallback support for determining if a string contains Unicode noncharacters.
- *
- * @since 6.9.0
- * @access private
- *
- * @see \wp_has_noncharacters()
- *
- * @param string $text Are there noncharacters in this string?
- * @return bool Whether noncharacters were found in the string.
- */
-function _wp_has_noncharacters_fallback( string $text ): bool {
-	$at                = 0;
-	$invalid_length    = 0;
-	$has_noncharacters = false;
-	$end               = strlen( $text );
-
-	while ( $at < $end && ! $has_noncharacters ) {
-		_wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters );
-		$at += $invalid_length;
-	}
-
-	return $has_noncharacters;
-}
-
-/**
- * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
- * with the deprecated function from the PHP standard library.
- *
- * @since 6.9.0
- * @access private
- *
- * @see \utf8_encode()
- *
- * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
- * @return string Text converted into UTF-8.
- */
-function _wp_utf8_encode_fallback( $iso_8859_1_text ) {
-	$iso_8859_1_text = (string) $iso_8859_1_text;
-	$at              = 0;
-	$was_at          = 0;
-	$end             = strlen( $iso_8859_1_text );
-	$utf8            = '';
-
-	while ( $at < $end ) {
-		// US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
-		$ascii_byte_count = strspn(
-			$iso_8859_1_text,
-			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
-			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
-			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
-			$at
-		);
-
-		if ( $ascii_byte_count > 0 ) {
-			$at += $ascii_byte_count;
-			continue;
-		}
-
-		// All other bytes transform into two-byte UTF-8 sequences.
-		$code_point = ord( $iso_8859_1_text[ $at ] );
-		$byte1      = chr( 0xC0 | ( $code_point >> 6 ) );
-		$byte2      = chr( 0x80 | ( $code_point & 0x3F ) );
-
-		$utf8 .= substr( $iso_8859_1_text, $was_at, $at - $was_at );
-		$utf8 .= "{$byte1}{$byte2}";
-
-		++$at;
-		$was_at = $at;
-	}
-
-	if ( 0 === $was_at ) {
-		return $iso_8859_1_text;
-	}
-
-	$utf8 .= substr( $iso_8859_1_text, $was_at );
-	return $utf8;
-}
-
-/**
- * Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility
- * with the deprecated function from the PHP standard library.
- *
- * @since 6.9.0
- * @access private
- *
- * @see utf8_decode()
- *
- * @param string $utf8_text Text treated as UTF-8 bytes.
- * @return string Text converted into ISO-8859-1.
- */
-function _wp_utf8_decode_fallback( $utf8_text ) {
-	$utf8_text       = (string) $utf8_text;
-	$at              = 0;
-	$was_at          = 0;
-	$end             = strlen( $utf8_text );
-	$iso_8859_1_text = '';
-
-	while ( $at < $end ) {
-		// US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
-		$ascii_byte_count = strspn(
-			$utf8_text,
-			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
-			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
-			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
-			$at
-		);
-
-		if ( $ascii_byte_count > 0 ) {
-			$at += $ascii_byte_count;
-			continue;
-		}
-
-		$next_at        = $at;
-		$invalid_length = 0;
-		$found          = _wp_scan_utf8( $utf8_text, $next_at, $invalid_length, null, 1 );
-		$span_length    = $next_at - $at;
-		$next_byte      = '?';
-
-		if ( 1 !== $found ) {
-			if ( $invalid_length > 0 ) {
-				$next_byte = '';
-				goto flush_sub_part;
-			}
-
-			break;
-		}
-
-		// All convertible code points are two-bytes long.
-		$byte1 = ord( $utf8_text[ $at ] );
-		if ( 0xC0 !== ( $byte1 & 0xE0 ) ) {
-			goto flush_sub_part;
-		}
-
-		// All convertible code points are not greater than U+FF.
-		$byte2      = ord( $utf8_text[ $at + 1 ] );
-		$code_point = ( ( $byte1 & 0x1F ) << 6 ) | ( ( $byte2 & 0x3F ) );
-		if ( $code_point > 0xFF ) {
-			goto flush_sub_part;
-		}
-
-		$next_byte = chr( $code_point );
-
-		flush_sub_part:
-		$iso_8859_1_text .= substr( $utf8_text, $was_at, $at - $was_at );
-		$iso_8859_1_text .= $next_byte;
-		$at              += $span_length;
-		$was_at           = $at;
-
-		if ( $invalid_length > 0 ) {
-			$iso_8859_1_text .= '?';
-			$at              += $invalid_length;
-			$was_at           = $at;
-		}
-	}
-
-	if ( 0 === $was_at ) {
-		return $utf8_text;
-	}
-
-	$iso_8859_1_text .= substr( $utf8_text, $was_at );
-	return $iso_8859_1_text;
-}
-
-/**
- * Indicates if a given byte stream represents valid UTF-8.
- *
- * Note that unpaired surrogate halves are not valid UTF-8 and will be rejected.
- *
- * Example:
- *
- *     true  === utf8_is_valid_byte_stream( 'Hello, World! 🌎' );
- *
- *     false === utf8_is_valid_byte_stream( "Latin1 is n\xF6t valid UTF-8.", 0, $error_at );
- *     12    === $error_at;
- *
- *     false === utf8_is_valid_byte_stream( "Surrogate halves like '\xDE\xFF\x80' are not permitted.", 0, $error_at );
- *     23    === $error_at;
- *
- *     false === utf8_is_valid_byte_stream( "Broken stream: \xC2\xC2", 0, $error_at );
- *     15    === $error_at;
- *
- * @param  string   $bytes  Text to validate as UTF-8 bytes.
- * @param  int      $starting_byte  Byte offset in string where decoding should begin.
- * @param  int|null $first_error_byte_at  Optional. If provided and byte stream fails to validate,
- *                                     will be set to the byte offset where the first invalid
- *                                     byte appeared. Otherwise, will not be set.
- *
- * @return bool Whether the given byte stream represents valid UTF-8.
- * @since {WP_VERSION}
- */
-function utf8_is_valid_byte_stream( string $bytes, int $starting_byte = 0, ?int &$first_error_byte_at = null ): bool {
-	$state         = UTF8_DECODER_ACCEPT;
-	$last_start_at = $starting_byte;
-
-	for ( $at = $starting_byte, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) {
-		if ( UTF8_DECODER_ACCEPT === $state ) {
-			$last_start_at = $at;
-		}
-
-		$state = utf8_decoder_apply_byte( $bytes[ $at ], $state );
-	}
-
-	if ( UTF8_DECODER_ACCEPT === $state ) {
-		return true;
-	} else {
-		$first_error_byte_at = $last_start_at;
-
-		return false;
-	}
-}
-
-/**
- * Returns number of code points found within a UTF-8 string, similar to `strlen()`.
- *
- * If the byte stream fails to properly decode as UTF-8 this function will set the
- * byte index of the first error byte and report the number of decoded code points.
- *
- * @param  string   $bytes  Text for which to count code points.
- * @param  int|null $first_error_byte_at  Optional. If provided, will be set upon finding
- *                                     the first invalid byte.
- *
- * @return int How many code points were decoded in the given byte stream before an error
- *             or before reaching the end of the string.
- * @since {WP_VERSION}
- */
-function utf8_codepoint_count( string $bytes, ?int &$first_error_byte_at = null ): int {
-	$state         = UTF8_DECODER_ACCEPT;
-	$last_start_at = 0;
-	$count         = 0;
-	$codepoint     = 0;
-
-	for ( $at = 0, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) {
-		if ( UTF8_DECODER_ACCEPT === $state ) {
-			$last_start_at = $at;
-		}
-
-		$state = utf8_decoder_apply_byte( $bytes[ $at ], $state, $codepoint );
-
-		if ( UTF8_DECODER_ACCEPT === $state ) {
-			++$count;
-		}
-	}
-
-	if ( UTF8_DECODER_ACCEPT !== $state ) {
-		$first_error_byte_at = $last_start_at;
-	}
-
-	return $count;
-}
-
-/**
- * Inner loop for a number of UTF-8 decoding-related functions.
- *
- * You probably don't need this! This is highly-specific and optimized
- * code for UTF-8 operations used in other functions.
- *
- * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
- *
- * @since {WP_VERSION}
- *
- * @access private
- *
- * @param  string   $byte  Next byte to be applied in UTF-8 decoding or validation.
- * @param  int      $state  UTF-8 decoding state, one of the following values:<br><ul>
- *                                 <li>`UTF8_DECODER_ACCEPT`: Decoder is ready for a new code point.<br>
- *                                 <li>`UTF8_DECODER_REJECT`: An error has occurred.<br>
- *                                 Any other positive value: Decoder is waiting for additional bytes.
- * @param  int|null $codepoint  Optional. If provided, will accumulate the decoded code point as
- *                            each byte is processed. If not provided or unable to decode, will
- *                            not be set, or will be set to invalid and unusable data.
- *
- * @return int Next decoder state after processing the current byte.
- */
-function utf8_decoder_apply_byte( string $byte, int $state, int &$codepoint = 0 ): int {
-	/**
-	 * State classification and transition table for UTF-8 validation.
-	 *
-	 * > The first part of the table maps bytes to character classes that
-	 * > to reduce the size of the transition table and create bitmasks.
-	 * >
-	 * > The second part is a transition table that maps a combination
-	 * > of a state of the automaton and a character class to a state.
-	 *
-	 * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
-	 */
-	static $state_table = (
-		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
-		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
-		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
-		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
-		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09" .
-		"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" .
-		"\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02" .
-		"\x10\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03" .
-		"\x11\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" .
-		"\x00\x01\x02\x03\x05\x08\x07\x01\x01\x01\x04\x06\x01\x01\x01\x01" .
-		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x01\x01\x01\x01\x01\x00\x01\x00\x01\x01\x01\x01\x01\x01" .
-		"\x01\x02\x01\x01\x01\x01\x01\x02\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01" .
-		"\x01\x02\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01" .
-		"\x01\x03\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01\x01\x03\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
-	);
-
-	$byte      = ord( $byte );
-	$type      = ord( $state_table[ $byte ] );
-	$codepoint = ( UTF8_DECODER_ACCEPT === $state )
-		? ( ( 0xFF >> $type ) & $byte )
-		: ( ( $byte & 0x3F ) | ( $codepoint << 6 ) );
-
-	return ord( $state_table[ 256 + ( $state * 16 ) + $type ] );
-}
-
-/**
- * Extract a slice of a text by code point, where invalid byte sequences count
- * as a single code point, U+FFFD (the Unicode replacement character `�`).
- *
- * This function does not permit passing negative indices and will return
- * the original string if such are provide.
- *
- * @param  string $text  Input text from which to extract.
- * @param  int    $from  Start extracting after this many code-points.
- * @param  int    $length  Extract this many code points.
- *
- * @return string Extracted slice of input string.
- */
-function utf8_substr( string $text, int $from = 0, ?int $length = null ): string {
-	if ( $from < 0 || ( isset( $length ) && $length < 0 ) ) {
-		return $text;
-	}
-
-	$position_in_input = 0;
-	$codepoint_at      = 0;
-	$end_byte          = strlen( $text );
-	$buffer            = '';
-	$seen_codepoints   = 0;
-	$sliced_codepoints = 0;
-	$decoder_state     = UTF8_DECODER_ACCEPT;
-
-	// Get to the start of the string.
-	while ( $position_in_input < $end_byte && $seen_codepoints < $length ) {
-		$decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );
-
-		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
-			++$position_in_input;
-
-			if ( $seen_codepoints >= $from ) {
-				++$sliced_codepoints;
-				$buffer .= substr( $text, $codepoint_at, $position_in_input - $codepoint_at );
-			}
-
-			++$seen_codepoints;
-			$codepoint_at = $position_in_input;
-		} elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
-			// "\u{FFFD}" is not supported in PHP 5.6.
-			$buffer .= "\xEF\xBF\xBD";
-
-			// Skip to the start of the next code point.
-			while ( UTF8_DECODER_REJECT === $decoder_state && $position_in_input < $end_byte ) {
-				$decoder_state = utf8_decoder_apply_byte( $text[ ++$position_in_input ], UTF8_DECODER_ACCEPT );
-			}
-
-			++$seen_codepoints;
-			$codepoint_at  = $position_in_input;
-			$decoder_state = UTF8_DECODER_ACCEPT;
-		} else {
-			++$position_in_input;
-		}
-	}
-
-	return $buffer;
-}
-
 /**
  * Extract a unicode codepoint from a specific offset in text.
  * Invalid byte sequences count as a single code point, U+FFFD
@@ -809,32 +29,17 @@ function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes
 		return null;
 	}
 
-	$position_in_input = $byte_offset;
-	$codepoint_at      = $byte_offset;
-	$end_byte          = strlen( $text );
-	$codepoint         = null;
-	$decoder_state     = UTF8_DECODER_ACCEPT;
-
-	// Get to the start of the string.
-	while ( $position_in_input < $end_byte ) {
-		$decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );
-
-		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
-			++$position_in_input;
-			$codepoint = utf8_ord( substr( $text, $codepoint_at, $position_in_input - $codepoint_at ) );
-			break;
-		} elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
-			// "\u{FFFD}" is not supported in PHP 5.6.
-			$codepoint = utf8_ord( "\xEF\xBF\xBD" );
-			break;
-		} else {
-			++$position_in_input;
-		}
+	// Initialize the by-reference out-parameter so the scanner never receives an undefined variable.
+	$invalid_length  = 0;
+	$new_byte_offset = $byte_offset;
+	if ( 1 !== _wp_scan_utf8( $text, $new_byte_offset, $invalid_length, null, 1 ) ) {
+		// Report the length of the invalid span so callers can advance past it.
+		$matched_bytes = $invalid_length;
+		return utf8_ord( "\u{FFFD}" );
 	}
 
-	$matched_bytes = $position_in_input - $byte_offset;
-
-	return $codepoint;
+	$matched_bytes = $new_byte_offset - $byte_offset;
+	return utf8_ord( substr( $text, $byte_offset, $matched_bytes ) );
 }
 
 /**
diff --git a/components/Encoding/utf8.php b/components/Encoding/utf8.php
new file mode 100644
index 00000000..273d1900
--- /dev/null
+++ b/components/Encoding/utf8.php
@@ -0,0 +1,179 @@
+<?php
+
+namespace WordPress\Encoding;
+
+if ( extension_loaded( 'mbstring' ) ) :
+	/**
+	 * Determines if a given byte string represents a valid UTF-8 encoding.
+	 *
+	 * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but
+	 * it is still possible. Many texts are simultaneously valid UTF-8,
+	 * valid US-ASCII, and valid ISO-8859-1 (`latin1`).
+	 *
+	 * Example:
+	 *
+	 *     true === wp_is_valid_utf8( '' );
+	 *     true === wp_is_valid_utf8( 'just a test' );
+	 *     true === wp_is_valid_utf8( "\xE2\x9C\x8F" );    // Pencil, U+270F.
+	 *     true === wp_is_valid_utf8( "\u{270F}" );        // Pencil, U+270F.
+	 *     true === wp_is_valid_utf8( '✏' );              // Pencil, U+270F.
+	 *
+	 *     false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes.
+	 *     false === wp_is_valid_utf8( "\xE2\x9C" );       // Invalid/incomplete sequences.
+	 *     false === wp_is_valid_utf8( "\xC1\xBF" );       // Overlong sequences.
+	 *     false === wp_is_valid_utf8( "\xED\xB0\x80" );   // Surrogate halves.
+	 *     false === wp_is_valid_utf8( "B\xFCch" );        // ISO-8859-1 high-bytes.
+	 *                                                     // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC,
+	 *                                                     // but in UTF-8 is the two-byte sequence 0xC3 0xBC.
+	 *
+	 * A “valid” string consists of “well-formed UTF-8 code unit sequence[s],” meaning
+	 * that the bytes conform to the UTF-8 encoding scheme, all characters use the minimal
+	 * byte sequence required by UTF-8, and that no sequence encodes a UTF-16 surrogate
+	 * code point or any character above the representable range.
+	 *
+	 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G32860
+	 *
+	 * @since 6.9.0
+	 *
+	 * @param string $bytes String which might contain text encoded as UTF-8.
+	 * @return bool Whether the provided bytes can decode as valid UTF-8.
+	 */
+	function wp_is_valid_utf8( string $bytes ): bool {
+		return mb_check_encoding( $bytes, 'UTF-8' );
+	}
+else :
+	/**
+	 * Fallback for validating UTF-8 when the mbstring extension is not loaded.
+	 *
+	 * @ignore
+	 * @private
+	 *
+	 * @since 6.9.0
+	 */
+	function wp_is_valid_utf8( string $bytes ): bool {
+		return _wp_is_valid_utf8_fallback( $bytes );
+	}
+endif;
+
+if (
+	extension_loaded( 'mbstring' ) &&
+	// Maximal subpart substitution introduced by php/php-src@04e59c916f12b322ac55f22314e31bd0176d01cb.
+	version_compare( PHP_VERSION, '8.1.6', '>=' )
+) :
+	/**
+	 * Replaces ill-formed UTF-8 byte sequences with the Unicode Replacement Character.
+	 *
+	 * Knowing what to do in the presence of text encoding issues can be complicated.
+	 * This function replaces invalid spans of bytes to neutralize any corruption that
+	 * may be there and prevent it from causing further problems downstream.
+	 *
+	 * However, it’s not always ideal to replace those bytes. In some settings it may
+	 * be best to leave the invalid bytes in the string so that downstream code can handle
+	 * them in a specific way. Replacing the bytes too early, like escaping for HTML too
+	 * early, can introduce other forms of corruption and data loss.
+	 *
+	 * When in doubt, use this function to replace spans of invalid bytes.
+	 *
+	 * Replacement follows the “maximal subpart” algorithm for secure and interoperable
+	 * strings. This can lead to sequences of multiple replacement characters in a row.
+	 *
+	 * Example:
+	 *
+	 *     // Valid strings come through unchanged.
+	 *     'test' === wp_scrub_utf8( 'test' );
+	 *
+	 *     // Invalid sequences of bytes are replaced.
+	 *     $invalid = "the byte \xC0 is never allowed in a UTF-8 string.";
+	 *     "the byte \u{FFFD} is never allowed in a UTF-8 string." === wp_scrub_utf8( $invalid );
+	 *     'the byte � is never allowed in a UTF-8 string.' === wp_scrub_utf8( $invalid );
+	 *
+	 *     // Maximal subparts are replaced individually.
+	 *     '.�.' === wp_scrub_utf8( ".\xC0." );              // C0 is never valid.
+	 *     '.�.' === wp_scrub_utf8( ".\xE2\x8C." );          // Missing A3 at end.
+	 *     '.��.' === wp_scrub_utf8( ".\xE2\x8C\xE2\x8C." ); // Maximal subparts replaced separately.
+	 *     '.��.' === wp_scrub_utf8( ".\xC1\xBF." );         // Overlong sequence.
+	 *     '.���.' === wp_scrub_utf8( ".\xED\xA0\x80." );    // Surrogate half.
+	 *
+	 * Note! The Unicode Replacement Character is itself a Unicode character (U+FFFD).
+	 * Once a span of invalid bytes has been replaced by one, it will not be possible
+	 * to know whether the replacement character was originally intended to be there
+	 * or if it is the result of scrubbing bytes. It is ideal to leave replacement for
+	 * display only, but some contexts (e.g. generating XML or passing data into a
+	 * large language model) require valid input strings.
+	 *
+	 * @since 6.9.0
+	 *
+	 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
+	 *
+	 * @param string $text String which is assumed to be UTF-8 but may contain invalid sequences of bytes.
+	 * @return string Input text with invalid sequences of bytes replaced with the Unicode replacement character.
+	 */
+	function wp_scrub_utf8( $text ) {
+		/*
+		 * While it looks like setting the substitute character could fail,
+		 * the internal PHP code will never fail when provided a valid
+		 * code point as a number. In this case, there’s no need to check
+		 * its return value to see if it succeeded.
+		 */
+		$prev_replacement_character = mb_substitute_character();
+		mb_substitute_character( 0xFFFD );
+		$scrubbed = mb_scrub( $text, 'UTF-8' );
+		mb_substitute_character( $prev_replacement_character );
+
+		return $scrubbed;
+	}
+else :
+	/**
+	 * Fallback for scrubbing UTF-8 when mbstring is not loaded or PHP is older than 8.1.6.
+	 *
+	 * @ignore
+	 * @private
+	 *
+	 * @since 6.9.0
+	 */
+	function wp_scrub_utf8( $text ) {
+		return _wp_scrub_utf8_fallback( $text );
+	}
+endif;
+
+if ( _wp_can_use_pcre_u() ) :
+	/**
+	 * Returns whether the given string contains Unicode noncharacters.
+	 *
+	 * XML recommends against using noncharacters and HTML forbids their
+	 * use in attribute names. Unicode recommends that they not be used
+	 * in open exchange of data.
+	 *
+	 * Noncharacters are code points within the following ranges:
+	 *  - U+FDD0–U+FDEF
+	 *  - U+FFFE–U+FFFF
+	 *  - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
+	 *
+	 * @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
+	 * @see https://www.w3.org/TR/xml/#charsets
+	 * @see https://html.spec.whatwg.org/#attributes-2
+	 *
+	 * @since 6.9.0
+	 *
+	 * @param string $text Are there noncharacters in this string?
+	 * @return bool Whether noncharacters were found; false whenever preg_match() returns anything other than 1.
+	 */
+	function wp_has_noncharacters( string $text ): bool {
+		return 1 === preg_match(
+			'/[\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]/u',
+			$text
+		);
+	}
+else :
+	/**
+	 * Fallback for detecting noncharacters, defined when `_wp_can_use_pcre_u()` returns false.
+	 *
+	 * @ignore
+	 * @private
+	 *
+	 * @since 6.9.0
+	 */
+	function wp_has_noncharacters( string $text ): bool {
+		return _wp_has_noncharacters_fallback( $text );
+	}
+endif;

From 196bd4e177432af62f7388a2ef0981c857e4d5f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Fri, 31 Oct 2025 13:47:30 +0100
Subject: [PATCH 02/28] Update composer.json files map

---
 composer.json | 212 +++++++++++++++++++++++++-------------------------
 1 file changed, 107 insertions(+), 105 deletions(-)

diff --git a/composer.json b/composer.json
index ba5ce8a6..fb26426d 100644
--- a/composer.json
+++ b/composer.json
@@ -1,107 +1,109 @@
 {
-    "name": "wp-php-toolkit/php-toolkit",
-    "type": "library",
-    "description": "WordPress Components",
-    "keywords": [
-        "wordpress",
-        "components"
-    ],
-    "homepage": "https://wordpress.org",
-    "license": "GPL-2.0-or-later",
-    "authors": [
-        {
-            "name": "Adam Zielinski",
-            "email": "adam@adamziel.com"
-        },
-        {
-            "name": "WordPress Contributors"
-        }
-    ],
-    "require": {
-        "php": ">=7.2",
-        "ext-json": "*",
-        "ext-mbstring": "*"
-    },
-    "require-dev": {
-        "yoast/phpunit-polyfills": "2.0.0",
-        "squizlabs/php_codesniffer": "^3.13.4",
-        "phpcompatibility/php-compatibility": "^9.3.5",
-        "slevomat/coding-standard": "^8.21.1",
-        "wp-coding-standards/wpcs": "^3.2.0",
-        "phpunit/phpunit": "^9.5",
-        "phpstan/phpstan": "^1.0"
-    },
-    "autoload": {
-        "exclude-from-classmap": [
-            "**/Tests/",
-            "**/bin/",
-            "/Tests/"
-        ],
-        "classmap": [
-            "components/BlockParser/",
-            "components/Blueprints/",
-            "components/Blueprints/vendor-patched/",
-            "components/CLI/",
-            "components/DataLiberation/",
-            "components/DataLiberation/vendor-patched/",
-            "components/Filesystem/",
-            "components/Git/",
-            "components/HTML/./",
-            "components/HttpClient/",
-            "components/HttpServer/",
-            "components/Markdown/",
-            "components/Markdown/vendor-patched",
-            "components/Merge/",
-            "components/Merge/vendor-patched",
-            "components/ByteStream/",
-            "components/ToolkitCodingStandards/",
-            "components/XML/",
-            "components/Zip/"
-        ],
-        "files": [
-            "components/DataLiberation/URL/functions.php",
-            "components/Encoding/utf8-decoder.php",
-            "components/Encoding/utf8-encoder.php",
-            "components/Filesystem/functions.php",
-            "components/Zip/functions.php",
-            "components/Polyfill/wordpress.php",
-            "components/Polyfill/mbstring.php",
-            "components/Polyfill/php-functions.php",
-            "components/Git/functions.php"
-        ],
-        "psr-4": {
-            "Rowbot\\": "components/DataLiberation/vendor-patched/",
-            "Brick\\": "components/DataLiberation/vendor-patched/",
-            "WordPress\\CORSProxy\\": "components/CORSProxy/"
-        }
-    },
-    "scripts": {
-        "build-php-toolkit-phar": "bash bin/build-libraries-phar.sh",
-        "build-blueprints-phar": "box compile -c phar-blueprints.json",
-        "regenerate-json-schema": "node components/Blueprints/Versions/Version2/json-schema/regenerate-schema.ts",
-        "test": "phpunit -c phpunit.xml",
-        "lint": "phpcs -d memory_limit=1G .",
-        "lint-fix": "phpcbf -d memory_limit=1G ."
-    },
-    "repositories": [
-        {
-            "type": "path",
-            "url": "components/*",
-            "options": {
-                "symlink": true
-            }
-        }
-    ],
-    "minimum-stability": "dev",
-    "config": {
-        "allow-plugins": {
-            "dealerdirect/phpcodesniffer-composer-installer": true
-        }
-    },
-    "archive": {
-        "exclude": [
-            "/plugins",
-            "/examples"
-        ]
-    }
+	"name": "wp-php-toolkit/php-toolkit",
+	"type": "library",
+	"description": "WordPress Components",
+	"keywords": [
+		"wordpress",
+		"components"
+	],
+	"homepage": "https://wordpress.org",
+	"license": "GPL-2.0-or-later",
+	"authors": [
+		{
+			"name": "Adam Zielinski",
+			"email": "adam@adamziel.com"
+		},
+		{
+			"name": "WordPress Contributors"
+		}
+	],
+	"require": {
+		"php": ">=7.2",
+		"ext-json": "*",
+		"ext-mbstring": "*"
+	},
+	"require-dev": {
+		"yoast/phpunit-polyfills": "2.0.0",
+		"squizlabs/php_codesniffer": "^3.13.4",
+		"phpcompatibility/php-compatibility": "^9.3.5",
+		"slevomat/coding-standard": "^8.21.1",
+		"wp-coding-standards/wpcs": "^3.2.0",
+		"phpunit/phpunit": "^9.5",
+		"phpstan/phpstan": "^1.0"
+	},
+	"autoload": {
+		"exclude-from-classmap": [
+			"**/Tests/",
+			"**/bin/",
+			"/Tests/"
+		],
+		"classmap": [
+			"components/BlockParser/",
+			"components/Blueprints/",
+			"components/Blueprints/vendor-patched/",
+			"components/CLI/",
+			"components/DataLiberation/",
+			"components/DataLiberation/vendor-patched/",
+			"components/Filesystem/",
+			"components/Git/",
+			"components/HTML/./",
+			"components/HttpClient/",
+			"components/HttpServer/",
+			"components/Markdown/",
+			"components/Markdown/vendor-patched",
+			"components/Merge/",
+			"components/Merge/vendor-patched",
+			"components/ByteStream/",
+			"components/ToolkitCodingStandards/",
+			"components/XML/",
+			"components/Zip/"
+		],
+		"files": [
+			"components/DataLiberation/URL/functions.php",
+			"components/Encoding/utf8.php",
+			"components/Encoding/compat-utf8.php",
+			"components/Encoding/utf8-decoder.php",
+			"components/Encoding/utf8-encoder.php",
+			"components/Filesystem/functions.php",
+			"components/Zip/functions.php",
+			"components/Polyfill/wordpress.php",
+			"components/Polyfill/mbstring.php",
+			"components/Polyfill/php-functions.php",
+			"components/Git/functions.php"
+		],
+		"psr-4": {
+			"Rowbot\\": "components/DataLiberation/vendor-patched/",
+			"Brick\\": "components/DataLiberation/vendor-patched/",
+			"WordPress\\CORSProxy\\": "components/CORSProxy/"
+		}
+	},
+	"scripts": {
+		"build-php-toolkit-phar": "bash bin/build-libraries-phar.sh",
+		"build-blueprints-phar": "box compile -c phar-blueprints.json",
+		"regenerate-json-schema": "node components/Blueprints/Versions/Version2/json-schema/regenerate-schema.ts",
+		"test": "phpunit -c phpunit.xml",
+		"lint": "phpcs -d memory_limit=1G .",
+		"lint-fix": "phpcbf -d memory_limit=1G ."
+	},
+	"repositories": [
+		{
+			"type": "path",
+			"url": "components/*",
+			"options": {
+				"symlink": true
+			}
+		}
+	],
+	"minimum-stability": "dev",
+	"config": {
+		"allow-plugins": {
+			"dealerdirect/phpcodesniffer-composer-installer": true
+		}
+	},
+	"archive": {
+		"exclude": [
+			"/plugins",
+			"/examples"
+		]
+	}
 }

From f82b874679203019965a224e4e0c8482f9dddc75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 00:49:54 +0100
Subject: [PATCH 03/28] polyfill _wp_can_use_pcre_u if missing

---
 components/Encoding/utf8.php | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/components/Encoding/utf8.php b/components/Encoding/utf8.php
index 273d1900..d5f06b1f 100644
--- a/components/Encoding/utf8.php
+++ b/components/Encoding/utf8.php
@@ -136,6 +136,21 @@ function wp_scrub_utf8( $text ) {
 	}
 endif;
 
+function _wp_can_use_pcre_u( $set = null ) {
+	static $utf8_pcre = 'reset';
+
+	if ( null !== $set ) {
+		$utf8_pcre = $set;
+	}
+
+	if ( 'reset' === $utf8_pcre ) {
+		// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- intentional error generated to detect PCRE/u support.
+		$utf8_pcre = @preg_match( '/^./u', 'a' );
+	}
+
+	return $utf8_pcre;
+}
+
 if ( _wp_can_use_pcre_u() ) :
 	/**
 	 * Returns whether the given string contains Unicode noncharacters.

From 840d61d0d376aac3973a75261b462119deb73728 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 00:56:48 +0100
Subject: [PATCH 04/28] use different namespaces for different files in case
 that's why composer won't autoload them

---
 components/Encoding/compat-utf8.php  |   2 +-
 components/Encoding/composer.json    |   1 -
 components/Encoding/utf8-decoder.php |  67 -------------
 components/Encoding/utf8-encoder.php |  66 -------------
 components/Encoding/utf8.php         | 136 +++++++++++++++++++++++++++
 composer.json                        |   1 -
 6 files changed, 137 insertions(+), 136 deletions(-)
 delete mode 100644 components/Encoding/utf8-decoder.php

diff --git a/components/Encoding/compat-utf8.php b/components/Encoding/compat-utf8.php
index ab6e8cd0..89dafb5c 100644
--- a/components/Encoding/compat-utf8.php
+++ b/components/Encoding/compat-utf8.php
@@ -1,6 +1,6 @@
 <?php
 
-namespace WordPress\Encoding;
+namespace WordPress\Encoding\compat;
 
 /**
  * Finds spans of valid and invalid UTF-8 bytes in a given string.
diff --git a/components/Encoding/composer.json b/components/Encoding/composer.json
index eba3afd0..72058a5c 100644
--- a/components/Encoding/composer.json
+++ b/components/Encoding/composer.json
@@ -20,7 +20,6 @@
 		"files": [
 			"utf8.php",
 			"compat-utf8.php",
-			"utf8-decoder.php",
 			"utf8-encoder.php"
 		],
 		"exclude-from-classmap": [
diff --git a/components/Encoding/utf8-decoder.php b/components/Encoding/utf8-decoder.php
deleted file mode 100644
index 74e685e3..00000000
--- a/components/Encoding/utf8-decoder.php
+++ /dev/null
@@ -1,67 +0,0 @@
-<?php
-
-namespace WordPress\Encoding;
-
-/*
- * UTF-8 decoding pipeline by Dennis Snell (@dmsnell), originally
- * proposed in https://github.com/WordPress/wordpress-develop/pull/6883.
- *
- * It enables parsing XML documents with incomplete UTF-8 byte sequences
- * without crashing or depending on the mbstring extension.
- */
-
-/**
- * Extract a unicode codepoint from a specific offset in text.
- * Invalid byte sequences count as a single code point, U+FFFD
- * (the Unicode replacement character ``).
- *
- * This function does not permit passing negative indices and will return
- * null if such are provided.
- *
- * @param  string $text  Input text from which to extract.
- * @param  int    $byte_offset  Start at this byte offset in the input text.
- * @param  int    $matched_bytes  How many bytes were matched to produce the codepoint.
- *
- * @return int Unicode codepoint.
- */
-function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes = 0 ) {
-	if ( $byte_offset < 0 ) {
-		return null;
-	}
-
-	$new_byte_offset = $byte_offset;
-	if( 1 !== _wp_scan_utf8( $text, $new_byte_offset, $invalid_length, null, 1 ) ) {
-		return utf8_ord( "\u{FFFD}" );
-	}
-
-	$matched_bytes = $new_byte_offset - $byte_offset;
-	return utf8_ord( substr( $text, $byte_offset, $matched_bytes ) );
-}
-
-/**
- * Convert a UTF-8 byte sequence to its Unicode codepoint.
- *
- * @param  string $character  UTF-8 encoded byte sequence representing a single Unicode character.
- *
- * @return int Unicode codepoint.
- */
-function utf8_ord( string $character ): int {
-	// Convert the byte sequence to its binary representation.
-	$bytes = unpack( 'C*', $character );
-
-	// Initialize the codepoint.
-	$codepoint = 0;
-
-	// Calculate the codepoint based on the number of bytes.
-	if ( 1 === count( $bytes ) ) {
-		$codepoint = $bytes[1];
-	} elseif ( 2 === count( $bytes ) ) {
-		$codepoint = ( ( $bytes[1] & 0x1F ) << 6 ) | ( $bytes[2] & 0x3F );
-	} elseif ( 3 === count( $bytes ) ) {
-		$codepoint = ( ( $bytes[1] & 0x0F ) << 12 ) | ( ( $bytes[2] & 0x3F ) << 6 ) | ( $bytes[3] & 0x3F );
-	} elseif ( 4 === count( $bytes ) ) {
-		$codepoint = ( ( $bytes[1] & 0x07 ) << 18 ) | ( ( $bytes[2] & 0x3F ) << 12 ) | ( ( $bytes[3] & 0x3F ) << 6 ) | ( $bytes[4] & 0x3F );
-	}
-
-	return $codepoint;
-}
diff --git a/components/Encoding/utf8-encoder.php b/components/Encoding/utf8-encoder.php
index 3406b1ea..7cd7b287 100644
--- a/components/Encoding/utf8-encoder.php
+++ b/components/Encoding/utf8-encoder.php
@@ -1,69 +1,3 @@
 <?php
 
 namespace WordPress\Encoding;
-
-/**
- * UTF-8 encoding pipeline by Dennis Snell (@dmsnell).
- *
- * It enables parsing XML documents with incomplete UTF-8 byte sequences
- * without crashing or depending on the mbstring extension.
- */
-
-/**
- * Encode a code point number into the UTF-8 encoding.
- *
- * This encoder implements the UTF-8 encoding algorithm for converting
- * a code point into a byte sequence. If it receives an invalid code
- * point it will return the Unicode Replacement Character U+FFFD `�`.
- *
- * Example:
- *
- *     '🅰' === WP_HTML_Decoder::codepoint_to_utf8_bytes( 0x1f170 );
- *
- *     // Half of a surrogate pair is an invalid code point.
- *     '�' === WP_HTML_Decoder::codepoint_to_utf8_bytes( 0xd83c );
- *
- * @since 6.6.0
- *
- * @see https://www.rfc-editor.org/rfc/rfc3629 For the UTF-8 standard.
- *
- * @param int $codepoint Which code point to convert.
- * @return string Converted code point, or `�` if invalid.
- */
-function codepoint_to_utf8_bytes( $codepoint ) {
-	// Pre-check to ensure a valid code point.
-	if (
-		$codepoint <= 0 ||
-		( $codepoint >= 0xD800 && $codepoint <= 0xDFFF ) ||
-		$codepoint > 0x10FFFF
-	) {
-		return '�';
-	}
-
-	if ( $codepoint <= 0x7F ) {
-		return chr( $codepoint );
-	}
-
-	if ( $codepoint <= 0x7FF ) {
-		$byte1 = chr( ( 0xC0 | ( ( $codepoint >> 6 ) & 0x1F ) ) );
-		$byte2 = chr( $codepoint & 0x3F | 0x80 );
-
-		return "{$byte1}{$byte2}";
-	}
-
-	if ( $codepoint <= 0xFFFF ) {
-		$byte1 = chr( ( $codepoint >> 12 ) | 0xE0 );
-		$byte2 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
-		$byte3 = chr( $codepoint & 0x3F | 0x80 );
-
-		return "{$byte1}{$byte2}{$byte3}";
-	}
-
-	// Any values above U+10FFFF are eliminated above in the pre-check.
-	$byte1 = chr( ( $codepoint >> 18 ) | 0xF0 );
-	$byte2 = chr( ( $codepoint >> 12 ) & 0x3F | 0x80 );
-	$byte3 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
-	$byte4 = chr( $codepoint & 0x3F | 0x80 );
-
-	return "{$byte1}{$byte2}{$byte3}{$byte4}";
-}
diff --git a/components/Encoding/utf8.php b/components/Encoding/utf8.php
index d5f06b1f..034e4228 100644
--- a/components/Encoding/utf8.php
+++ b/components/Encoding/utf8.php
@@ -2,6 +2,11 @@
 
 namespace WordPress\Encoding;
 
+use function WordPress\Encoding\compat\_wp_is_valid_utf8_fallback;
+use function WordPress\Encoding\compat\_wp_scrub_utf8_fallback;
+use function WordPress\Encoding\compat\_wp_has_noncharacters_fallback;
+use function WordPress\Encoding\compat\_wp_scan_utf8;
+
 if ( extension_loaded( 'mbstring' ) ) :
 	/**
 	 * Determines if a given byte string represents a valid UTF-8 encoding.
@@ -192,3 +197,134 @@ function wp_has_noncharacters( string $text ): bool {
 		return _wp_has_noncharacters_fallback( $text );
 	}
 endif;
+
+/**
+ * UTF-8 encoding pipeline by Dennis Snell (@dmsnell).
+ *
+ * It enables parsing XML documents with incomplete UTF-8 byte sequences
+ * without crashing or depending on the mbstring extension.
+ */
+
+/**
+ * Encode a code point number into the UTF-8 encoding.
+ *
+ * This encoder implements the UTF-8 encoding algorithm for converting
+ * a code point into a byte sequence. If it receives an invalid code
+ * point it will return the Unicode Replacement Character U+FFFD `�`.
+ *
+ * Example:
+ *
+ *     '🅰' === WP_HTML_Decoder::codepoint_to_utf8_bytes( 0x1f170 );
+ *
+ *     // Half of a surrogate pair is an invalid code point.
+ *     '�' === WP_HTML_Decoder::codepoint_to_utf8_bytes( 0xd83c );
+ *
+ * @since 6.6.0
+ *
+ * @see https://www.rfc-editor.org/rfc/rfc3629 For the UTF-8 standard.
+ *
+ * @param int $codepoint Which code point to convert.
+ * @return string Converted code point, or `�` if invalid.
+ */
+function codepoint_to_utf8_bytes( $codepoint ) {
+	// Pre-check to ensure a valid code point.
+	if (
+		$codepoint <= 0 ||
+		( $codepoint >= 0xD800 && $codepoint <= 0xDFFF ) ||
+		$codepoint > 0x10FFFF
+	) {
+		return '�';
+	}
+
+	if ( $codepoint <= 0x7F ) {
+		return chr( $codepoint );
+	}
+
+	if ( $codepoint <= 0x7FF ) {
+		$byte1 = chr( ( 0xC0 | ( ( $codepoint >> 6 ) & 0x1F ) ) );
+		$byte2 = chr( $codepoint & 0x3F | 0x80 );
+
+		return "{$byte1}{$byte2}";
+	}
+
+	if ( $codepoint <= 0xFFFF ) {
+		$byte1 = chr( ( $codepoint >> 12 ) | 0xE0 );
+		$byte2 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
+		$byte3 = chr( $codepoint & 0x3F | 0x80 );
+
+		return "{$byte1}{$byte2}{$byte3}";
+	}
+
+	// Any values above U+10FFFF are eliminated above in the pre-check.
+	$byte1 = chr( ( $codepoint >> 18 ) | 0xF0 );
+	$byte2 = chr( ( $codepoint >> 12 ) & 0x3F | 0x80 );
+	$byte3 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
+	$byte4 = chr( $codepoint & 0x3F | 0x80 );
+
+	return "{$byte1}{$byte2}{$byte3}{$byte4}";
+}
+
+
+/*
+ * UTF-8 decoding pipeline by Dennis Snell (@dmsnell), originally
+ * proposed in https://github.com/WordPress/wordpress-develop/pull/6883.
+ *
+ * It enables parsing XML documents with incomplete UTF-8 byte sequences
+ * without crashing or depending on the mbstring extension.
+ */
+
+/**
+ * Extract a unicode codepoint from a specific offset in text.
+ * Invalid byte sequences count as a single code point, U+FFFD
+ * (the Unicode replacement character ``).
+ *
+ * This function does not permit passing negative indices and will return
+ * null if such are provided.
+ *
+ * @param  string $text  Input text from which to extract.
+ * @param  int    $byte_offset  Start at this byte offset in the input text.
+ * @param  int    $matched_bytes  How many bytes were matched to produce the codepoint.
+ *
+ * @return int Unicode codepoint.
+ */
+function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes = 0 ) {
+	if ( $byte_offset < 0 ) {
+		return null;
+	}
+
+	$new_byte_offset = $byte_offset;
+	if( 1 !== _wp_scan_utf8( $text, $new_byte_offset, $invalid_length, null, 1 ) ) {
+		return utf8_ord( "\u{FFFD}" );
+	}
+
+	$matched_bytes = $new_byte_offset - $byte_offset;
+	return utf8_ord( substr( $text, $byte_offset, $matched_bytes ) );
+}
+
+/**
+ * Convert a UTF-8 byte sequence to its Unicode codepoint.
+ *
+ * @param  string $character  UTF-8 encoded byte sequence representing a single Unicode character.
+ *
+ * @return int Unicode codepoint.
+ */
+function utf8_ord( string $character ): int {
+	// Convert the byte sequence to its binary representation.
+	$bytes = unpack( 'C*', $character );
+
+	// Initialize the codepoint.
+	$codepoint = 0;
+
+	// Calculate the codepoint based on the number of bytes.
+	if ( 1 === count( $bytes ) ) {
+		$codepoint = $bytes[1];
+	} elseif ( 2 === count( $bytes ) ) {
+		$codepoint = ( ( $bytes[1] & 0x1F ) << 6 ) | ( $bytes[2] & 0x3F );
+	} elseif ( 3 === count( $bytes ) ) {
+		$codepoint = ( ( $bytes[1] & 0x0F ) << 12 ) | ( ( $bytes[2] & 0x3F ) << 6 ) | ( $bytes[3] & 0x3F );
+	} elseif ( 4 === count( $bytes ) ) {
+		$codepoint = ( ( $bytes[1] & 0x07 ) << 18 ) | ( ( $bytes[2] & 0x3F ) << 12 ) | ( ( $bytes[3] & 0x3F ) << 6 ) | ( $bytes[4] & 0x3F );
+	}
+
+	return $codepoint;
+}
diff --git a/composer.json b/composer.json
index fb26426d..6d52bb4b 100644
--- a/composer.json
+++ b/composer.json
@@ -62,7 +62,6 @@
 			"components/DataLiberation/URL/functions.php",
 			"components/Encoding/utf8.php",
 			"components/Encoding/compat-utf8.php",
-			"components/Encoding/utf8-decoder.php",
 			"components/Encoding/utf8-encoder.php",
 			"components/Filesystem/functions.php",
 			"components/Zip/functions.php",

From 15f1a64f3577707f2cce3323e29baa06dfb800a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 00:58:21 +0100
Subject: [PATCH 05/28] update CI autoload

---
 composer-ci-matrix-tests.json | 178 ++++++++++++++++------------------
 1 file changed, 86 insertions(+), 92 deletions(-)

diff --git a/composer-ci-matrix-tests.json b/composer-ci-matrix-tests.json
index 91c612e2..de4747f7 100644
--- a/composer-ci-matrix-tests.json
+++ b/composer-ci-matrix-tests.json
@@ -1,94 +1,88 @@
 {
-    "name": "wordpress/components",
-    "type": "library",
-    "description": "WordPress Components",
-    "keywords": [
-        "wordpress",
-        "components"
-    ],
-    "homepage": "https://wordpress.org",
-    "license": "GPL-2.0-or-later",
-    "authors": [
-        {
-            "name": "Adam Zielinski",
-            "email": "adam@adamziel.com"
-        },
-        {
-            "name": "WordPress Contributors"
-        }
-    ],
-    "require": {
-        "php": ">=7.2",
-        "ext-json": "*",
-        "ext-mbstring": "*"
-    },
-    "require-dev": {
-        "yoast/phpunit-polyfills": "^4.0.0",
-        "phpcompatibility/php-compatibility": "^9.3.5",
-        "phpunit/phpunit": "8.5.x || ^9.5"
-    },
-    "autoload": {
-        "exclude-from-classmap": [
-            "**/Tests/",
-            "**/bin/",
-            "/Tests/"
-        ],
-        "classmap": [
-            "components/BlockParser/",
-            "components/Blueprints/",
-            "components/Blueprints/vendor-patched/",
-            "components/CLI/",
-            "components/DataLiberation/",
-            "components/DataLiberation/vendor-patched/",
-            "components/Filesystem/",
-            "components/Git/",
-            "components/HTML/./",
-            "components/HttpClient/",
-            "components/HttpServer/",
-            "components/Markdown/",
-            "components/Markdown/vendor-patched",
-            "components/Merge/",
-            "components/Merge/vendor-patched",
-            "components/ByteStream/",
-            "components/XML/",
-            "components/Zip/"
-        ],
-        "files": [
-            "components/DataLiberation/URL/functions.php",
-            "components/Encoding/utf8-decoder.php",
-            "components/Encoding/utf8-encoder.php",
-            "components/Filesystem/functions.php",
-            "components/Zip/functions.php",
-            "components/Polyfill/wordpress.php",
-            "components/Polyfill/mbstring.php",
-            "components/Git/functions.php"
-        ],
-        "psr-4": {
-            "Rowbot\\": "components/DataLiberation/vendor-patched/",
-            "Brick\\": "components/DataLiberation/vendor-patched/",
-            "WordPress\\CORSProxy\\": "components/CORSProxy/"
-        }
-    },
-    "scripts": {
-        "build-blueprints-phar": "box compile -c phar-box.json",
-        "regenerate-json-schema": "node components/Blueprints/Versions/Version2/json-schema/regenerate-schema.ts",
-        "test": "phpunit -c phpunit.xml",
-        "lint": "phpcs .",
-        "lint-fix": "phpcbf ."
-    },
-    "repositories": [
-        {
-            "type": "path",
-            "url": "components/*",
-            "options": {
-                "symlink": true
-            }
-        }
-    ],
-    "minimum-stability": "dev",
-    "config": {
-        "allow-plugins": {
-            "dealerdirect/phpcodesniffer-composer-installer": true
-        }
-    }
+	"name": "wordpress/components",
+	"type": "library",
+	"description": "WordPress Components",
+	"keywords": ["wordpress", "components"],
+	"homepage": "https://wordpress.org",
+	"license": "GPL-2.0-or-later",
+	"authors": [
+		{
+			"name": "Adam Zielinski",
+			"email": "adam@adamziel.com"
+		},
+		{
+			"name": "WordPress Contributors"
+		}
+	],
+	"require": {
+		"php": ">=7.2",
+		"ext-json": "*",
+		"ext-mbstring": "*"
+	},
+	"require-dev": {
+		"yoast/phpunit-polyfills": "^4.0.0",
+		"phpcompatibility/php-compatibility": "^9.3.5",
+		"phpunit/phpunit": "8.5.x || ^9.5"
+	},
+	"autoload": {
+		"exclude-from-classmap": ["**/Tests/", "**/bin/", "/Tests/"],
+		"classmap": [
+			"components/BlockParser/",
+			"components/Blueprints/",
+			"components/Blueprints/vendor-patched/",
+			"components/CLI/",
+			"components/DataLiberation/",
+			"components/DataLiberation/vendor-patched/",
+			"components/Filesystem/",
+			"components/Git/",
+			"components/HTML/./",
+			"components/HttpClient/",
+			"components/HttpServer/",
+			"components/Markdown/",
+			"components/Markdown/vendor-patched",
+			"components/Merge/",
+			"components/Merge/vendor-patched",
+			"components/ByteStream/",
+			"components/XML/",
+			"components/Zip/"
+		],
+		"files": [
+			"components/DataLiberation/URL/functions.php",
+			"components/Encoding/utf8.php",
+			"components/Encoding/compat-utf8.php",
+			"components/Encoding/utf8-encoder.php",
+			"components/Filesystem/functions.php",
+			"components/Zip/functions.php",
+			"components/Polyfill/wordpress.php",
+			"components/Polyfill/mbstring.php",
+			"components/Git/functions.php"
+		],
+		"psr-4": {
+			"Rowbot\\": "components/DataLiberation/vendor-patched/",
+			"Brick\\": "components/DataLiberation/vendor-patched/",
+			"WordPress\\CORSProxy\\": "components/CORSProxy/"
+		}
+	},
+	"scripts": {
+		"build-blueprints-phar": "box compile -c phar-box.json",
+		"regenerate-json-schema": "node components/Blueprints/Versions/Version2/json-schema/regenerate-schema.ts",
+		"test": "phpunit -c phpunit.xml",
+		"lint": "phpcs .",
+		"lint-fix": "phpcbf ."
+	},
+	"repositories": [
+		{
+			"type": "path",
+			"url": "components/*",
+			"options": {
+				"symlink": true
+			}
+		}
+	],
+	"minimum-stability": "dev",
+	"config": {
+		"allow-plugins": {
+			"dealerdirect/phpcodesniffer-composer-installer": true
+		}
+	}
 }

From 0499f8e283ac9dc70b3056ccc53ec1dc859a5172 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 00:59:50 +0100
Subject: [PATCH 06/28] phpcs

---
 components/Encoding/utf8.php | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/components/Encoding/utf8.php b/components/Encoding/utf8.php
index 034e4228..ddb8c3db 100644
--- a/components/Encoding/utf8.php
+++ b/components/Encoding/utf8.php
@@ -55,6 +55,7 @@ function wp_is_valid_utf8( string $bytes ): bool {
 	 *
 	 * @since 6.9.0
 	 */
+	// phpcs:ignore Universal.NamingConventions.NoReservedKeywordParameterNames.stringFound
 	function wp_is_valid_utf8( string $string ): bool {
 		return _wp_is_valid_utf8_fallback( $string );
 	}
@@ -293,7 +294,7 @@ function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes
 	}
 
 	$new_byte_offset = $byte_offset;
-	if( 1 !== _wp_scan_utf8( $text, $new_byte_offset, $invalid_length, null, 1 ) ) {
+	if ( 1 !== _wp_scan_utf8( $text, $new_byte_offset, $invalid_length, null, 1 ) ) {
 		return utf8_ord( "\u{FFFD}" );
 	}
 

From ffd2309cdc2abd0d8628f69debd97f646ab68259 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 01:01:07 +0100
Subject: [PATCH 07/28] tests

---
 components/Encoding/utf8.php | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/components/Encoding/utf8.php b/components/Encoding/utf8.php
index ddb8c3db..6e99ce00 100644
--- a/components/Encoding/utf8.php
+++ b/components/Encoding/utf8.php
@@ -294,6 +294,8 @@ function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes
 	}
 
 	$new_byte_offset = $byte_offset;
+	$invalid_length = 0;
+	// phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedFunctionFound
 	if ( 1 !== _wp_scan_utf8( $text, $new_byte_offset, $invalid_length, null, 1 ) ) {
 		return utf8_ord( "\u{FFFD}" );
 	}

From eb8201c9d1287fe446eebe8f2d8c7d5078ceb43f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 01:21:04 +0100
Subject: [PATCH 08/28] fix import

---
 components/DataLiberation/URL/class-cssprocessor.php | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/components/DataLiberation/URL/class-cssprocessor.php b/components/DataLiberation/URL/class-cssprocessor.php
index a7a1afcb..6690190f 100644
--- a/components/DataLiberation/URL/class-cssprocessor.php
+++ b/components/DataLiberation/URL/class-cssprocessor.php
@@ -2,8 +2,8 @@
 
 namespace WordPress\DataLiberation\URL;
 
-use function WordPress\Encoding\_wp_scan_utf8;
-use function WordPress\Encoding\_wp_scrub_utf8_fallback;
+use function WordPress\Encoding\compat\_wp_scan_utf8;
+use function WordPress\Encoding\compat\_wp_scrub_utf8_fallback;
 use function WordPress\Encoding\utf8_codepoint_at;
 use function WordPress\Encoding\codepoint_to_utf8_bytes;
 

From 963b38070ce5e20aed2b8f375235adfd0955984f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 01:21:27 +0100
Subject: [PATCH 09/28] fix wp_scrub_utf8 reference

---
 components/DataLiberation/URL/class-cssprocessor.php | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/components/DataLiberation/URL/class-cssprocessor.php b/components/DataLiberation/URL/class-cssprocessor.php
index 6690190f..9dfee375 100644
--- a/components/DataLiberation/URL/class-cssprocessor.php
+++ b/components/DataLiberation/URL/class-cssprocessor.php
@@ -3,7 +3,7 @@
 namespace WordPress\DataLiberation\URL;
 
 use function WordPress\Encoding\compat\_wp_scan_utf8;
-use function WordPress\Encoding\compat\_wp_scrub_utf8_fallback;
+use function WordPress\Encoding\wp_scrub_utf8;
 use function WordPress\Encoding\utf8_codepoint_at;
 use function WordPress\Encoding\codepoint_to_utf8_bytes;
 
@@ -1528,7 +1528,7 @@ private function consume_ident_start_codepoint( $at ): int {
 	 */
 	private function decode_string_or_url( int $start, int $length ): string {
 		// Fast path: check if any processing is needed.
-		$slice         = _wp_scrub_utf8_fallback( substr( $this->css, $start, $length ) );
+		$slice         = wp_scrub_utf8( substr( $this->css, $start, $length ) );
 		$special_chars = "\\\r\f\x00";
 		if ( false === strpbrk( $slice, $special_chars ) ) {
 			// No special chars - return raw substring (almost zero allocations).

From e43e00e3610fe3f347bf40a5898674ad4d70744b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 01:50:16 +0100
Subject: [PATCH 10/28] set matched_bytes in utf8_codepoint_at

---
 components/Encoding/utf8.php | 1 +
 1 file changed, 1 insertion(+)

diff --git a/components/Encoding/utf8.php b/components/Encoding/utf8.php
index 6e99ce00..f981361c 100644
--- a/components/Encoding/utf8.php
+++ b/components/Encoding/utf8.php
@@ -297,6 +297,7 @@ function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes
 	$invalid_length = 0;
 	// phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedFunctionFound
 	if ( 1 !== _wp_scan_utf8( $text, $new_byte_offset, $invalid_length, null, 1 ) ) {
+		$matched_bytes = $invalid_length;
 		return utf8_ord( "\u{FFFD}" );
 	}
 

From 70482ceca035ed4f44bcedee928b2c53fb8bbb9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 02:21:43 +0100
Subject: [PATCH 11/28] set matched_bytes in utf8_codepoint_at

---
 components/Encoding/compat-utf8.php | 2 +-
 components/Encoding/utf8.php        | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/components/Encoding/compat-utf8.php b/components/Encoding/compat-utf8.php
index 89dafb5c..d261c48d 100644
--- a/components/Encoding/compat-utf8.php
+++ b/components/Encoding/compat-utf8.php
@@ -54,7 +54,7 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
 	$max_count         = $max_code_points ?? PHP_INT_MAX;
 	$has_noncharacters = false;
 
-	for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
+	for ( $i = $at; $i < $end && $count < $max_count; $i++ ) {
 		/*
 		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
 		 *
diff --git a/components/Encoding/utf8.php b/components/Encoding/utf8.php
index f981361c..d373bab8 100644
--- a/components/Encoding/utf8.php
+++ b/components/Encoding/utf8.php
@@ -293,11 +293,18 @@ function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes
 		return null;
 	}
 
+	// Check if we're at or past the end of the string.
+	if ( $byte_offset >= strlen( $text ) ) {
+		$matched_bytes = 0;
+		return null;
+	}
+
 	$new_byte_offset = $byte_offset;
 	$invalid_length = 0;
 	// phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedFunctionFound
 	if ( 1 !== _wp_scan_utf8( $text, $new_byte_offset, $invalid_length, null, 1 ) ) {
-		$matched_bytes = $invalid_length;
+		// Ensure we always advance at least 1 byte to avoid infinite loops.
+		$matched_bytes = max( 1, $invalid_length );
 		return utf8_ord( "\u{FFFD}" );
 	}
 

From 0ea848847734b28fadf7da74f023841cf8878e5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 15:33:32 +0100
Subject: [PATCH 12/28] Fix infinite loop

---
 components/DataLiberation/URL/class-cssprocessor.php | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/components/DataLiberation/URL/class-cssprocessor.php b/components/DataLiberation/URL/class-cssprocessor.php
index 9dfee375..544f2dbc 100644
--- a/components/DataLiberation/URL/class-cssprocessor.php
+++ b/components/DataLiberation/URL/class-cssprocessor.php
@@ -1475,7 +1475,7 @@ private function consume_ident_codepoint( $at ): int {
 	 * @return int The number of bytes consumed.
 	 */
 	private function consume_ident_start_codepoint( $at ): int {
-		if ( $at > $this->length ) {
+		if ( $at >= $this->length ) {
 			return 0;
 		}
 
@@ -1500,9 +1500,11 @@ private function consume_ident_start_codepoint( $at ): int {
 			 *
 			 * We'll move forward by $invalid_length bytes and continue processing.
 			 * Later on, during the string decoding, we'll replace the invalid bytes with U+FFFD
-			 * via maximal subpart”replacement.
+			 * via "maximal subpart" replacement.
+			 *
+			 * Ensure we always return at least 1 byte to avoid infinite loops.
 			 */
-			return $invalid_length;
+			return max( 1, $invalid_length );
 		}
 
 		$codepoint_byte_length = $new_at - $at;

From 9db35e03e5c8b14fa3dd4ed22b8670236623d6a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 17:22:10 +0100
Subject: [PATCH 13/28] Restore the original utf8-decoder to see if that fixes
 the CI-only infinite loop

---
 components/Encoding/composer.json    |   3 +-
 components/Encoding/utf8-decoder.php | 866 +++++++++++++++++++++++++++
 components/Encoding/utf8.php         | 141 -----
 composer.json                        |   1 +
 4 files changed, 869 insertions(+), 142 deletions(-)
 create mode 100644 components/Encoding/utf8-decoder.php

diff --git a/components/Encoding/composer.json b/components/Encoding/composer.json
index 72058a5c..187b7559 100644
--- a/components/Encoding/composer.json
+++ b/components/Encoding/composer.json
@@ -20,7 +20,8 @@
 		"files": [
 			"utf8.php",
 			"compat-utf8.php",
-			"utf8-encoder.php"
+			"utf8-encoder.php",
+			"utf8-decoder.php"
 		],
 		"exclude-from-classmap": [
 			"/Tests/"
diff --git a/components/Encoding/utf8-decoder.php b/components/Encoding/utf8-decoder.php
new file mode 100644
index 00000000..e339e710
--- /dev/null
+++ b/components/Encoding/utf8-decoder.php
@@ -0,0 +1,866 @@
+<?php
+
+namespace WordPress\Encoding;
+
+/*
+ * UTF-8 decoding pipeline by Dennis Snell (@dmsnell), originally
+ * proposed in https://github.com/WordPress/wordpress-develop/pull/6883.
+ *
+ * It enables parsing XML documents with incomplete UTF-8 byte sequences
+ * without crashing or depending on the mbstring extension.
+ */
+
+if ( ! defined( 'UTF8_DECODER_ACCEPT' ) ) {
+	define( 'UTF8_DECODER_ACCEPT', 0 );
+}
+
+if ( ! defined( 'UTF8_DECODER_REJECT' ) ) {
+	define( 'UTF8_DECODER_REJECT', 1 );
+}
+
+/**
+ * Finds spans of valid and invalid UTF-8 bytes in a given string.
+ *
+ * This is a low-level tool to power various UTF-8 functionality.
+ * It scans through a string until it finds invalid byte spans.
+ * When it does this, it does three things:
+ *
+ *  - Assigns `$at` to the position after the last successful code point.
+ *  - Assigns `$invalid_length` to the length of the maximal subpart of
+ *    the invalid bytes starting at `$at`.
+ *  - Returns how many code points were successfully scanned.
+ *
+ * This information is enough to build a number of useful UTF-8 functions.
+ *
+ * Example:
+ *
+ *     // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
+ *     "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
+ *     $at = $invalid_length = 0;
+ *
+ *     // The first step finds the invalid 0xF1 byte.
+ *     2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
+ *     $at === 2; $invalid_length === 1;
+ *
+ *     // The second step continues to the end of the string.
+ *     1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
+ *     $at === 4; $invalid_length === 0;
+ *
+ * Note! While passing an options array here might be convenient from a calling-code standpoint,
+ *       this function is intended to serve as a very low-level foundation upon which to build
+ *       higher level functionality. For the sake of keeping costs explicit all arguments are
+ *       passed directly.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string    $bytes             UTF-8 encoded string which might include invalid spans of bytes.
+ * @param int       $at                Where to start scanning.
+ * @param int       $invalid_length    Will be set to how many bytes are to be ignored after `$at`.
+ * @param int|null  $max_bytes         Stop scanning after this many bytes have been seen.
+ * @param int|null  $max_code_points   Stop scanning after this many code points have been seen.
+ * @param bool|null $has_noncharacters Set to indicate if scanned string contained noncharacters.
+ * @return int How many code points were successfully scanned.
+ */
+function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
+	$byte_length       = strlen( $bytes );
+	$end               = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
+	$invalid_length    = 0;
+	$count             = 0;
+	$max_count         = $max_code_points ?? PHP_INT_MAX;
+	$has_noncharacters = false;
+
+	for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
+		/*
+		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
+		 *
+		 * This optimization step improves the speed from 10x to 100x
+		 * depending on whether the JIT has optimized the function.
+		 */
+		$ascii_byte_count = strspn(
+			$bytes,
+			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
+			$i,
+			$end - $i
+		);
+
+		if ( $count + $ascii_byte_count >= $max_count ) {
+			$at    = $i + ( $max_count - $count );
+			$count = $max_count;
+			return $count;
+		}
+
+		$count += $ascii_byte_count;
+		$i     += $ascii_byte_count;
+
+		if ( $i >= $end ) {
+			$at = $end;
+			return $count;
+		}
+
+		/**
+		 * The above fast-track handled all single-byte UTF-8 characters. What
+		 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
+		 *
+		 * Therefore everything past here is checking those multibyte sequences.
+		 *
+		 * It may look like there’s a need to check against the max bytes here,
+		 * but since each match of a single character returns, this function will
+		 * bail already if crossing the max-bytes threshold. This function SHALL
+		 * NOT return in the middle of a multi-byte character, so if a character
+		 * falls on each side of the max bytes, the entire character will be scanned.
+		 *
+		 * Because it’s possible that there are truncated characters, the use of
+		 * the null-coalescing operator with "\xC0" is a convenience for skipping
+		 * length checks on every continuation byte. This works because 0xC0 is
+		 * always invalid in a UTF-8 string, meaning that if the string has been
+		 * truncated, it will find 0xC0 and reject as invalid UTF-8.
+		 *
+		 * > [The following table] lists all of the byte sequences that are well-formed
+		 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
+		 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
+		 * > outside of the ranges listed is ill-formed.
+		 *
+		 * > Table 3-7. Well-Formed UTF-8 Byte Sequences
+		 *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
+		 *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
+		 *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
+		 *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
+		 *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
+		 *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
+		 *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
+		 *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
+		 *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
+		 *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
+		 *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
+		 *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
+		 *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
+		 *
+		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
+		 */
+
+		// Valid two-byte code points.
+		$b1 = ord( $bytes[ $i ] );
+		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
+
+		if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
+			++$count;
+			++$i;
+			continue;
+		}
+
+		// Valid three-byte code points.
+		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
+
+		if ( $b3 < 0x80 || $b3 > 0xBF ) {
+			goto invalid_utf8;
+		}
+
+		if (
+			( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
+			( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+			( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
+			( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
+		) {
+			++$count;
+			$i += 2;
+
+			// Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
+			if ( 0xEF === $b1 ) {
+				$has_noncharacters |= (
+					( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
+					( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
+				);
+			}
+
+			continue;
+		}
+
+		// Valid four-byte code points.
+		$b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
+
+		if ( $b4 < 0x80 || $b4 > 0xBF ) {
+			goto invalid_utf8;
+		}
+
+		if (
+			( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
+			( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+			( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
+		) {
+			++$count;
+			$i += 3;
+
+			// Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
+			$has_noncharacters |= (
+				( 0x0F === ( $b2 & 0x0F ) ) &&
+				0xBF === $b3 &&
+				( 0xBE === $b4 || 0xBF === $b4 )
+			);
+
+			continue;
+		}
+
+		/**
+		 * When encountering invalid byte sequences, Unicode suggests finding the
+		 * maximal subpart of a text and replacing that subpart with a single
+		 * replacement character.
+		 *
+		 * > This practice is more secure because it does not result in the
+		 * > conversion consuming parts of valid sequences as though they were
+		 * > invalid. It also guarantees at least one replacement character will
+		 * > occur for each instance of an invalid sequence in the original text.
+		 * > Furthermore, this practice can be defined consistently for better
+		 * > interoperability between different implementations of conversion.
+		 *
+		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
+		 */
+		invalid_utf8:
+		$at             = $i;
+		$invalid_length = 1;
+
+		// Single-byte and two-byte characters.
+		if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) {
+			return $count;
+		}
+
+		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
+		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
+
+		// Find the maximal subpart and skip past it.
+		if ( 0xE0 === ( $b1 & 0xF0 ) ) {
+			// Three-byte characters.
+			$b2_valid = (
+				( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
+				( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+				( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
+				( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
+			);
+
+			$invalid_length = min( $end - $i, $b2_valid ? 2 : 1 );
+			return $count;
+		} elseif ( 0xF0 === ( $b1 & 0xF8 ) ) {
+			// Four-byte characters.
+			$b2_valid = (
+				( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
+				( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+				( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
+			);
+
+			$b3_valid = $b3 >= 0x80 && $b3 <= 0xBF;
+
+			$invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 );
+			return $count;
+		}
+
+		return $count;
+	}
+
+	$at = $i;
+	return $count;
+}
+
+/**
+ * Fallback mechanism for safely validating UTF-8 bytes.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see wp_is_valid_utf8()
+ *
+ * @param string $bytes String which might contain text encoded as UTF-8.
+ * @return bool Whether the provided bytes can decode as valid UTF-8.
+ */
+function _wp_is_valid_utf8_fallback( string $bytes ): bool {
+	$bytes_length = strlen( $bytes );
+	if ( 0 === $bytes_length ) {
+		return true;
+	}
+
+	$next_byte_at   = 0;
+	$invalid_length = 0;
+
+	_wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
+
+	return $bytes_length === $next_byte_at && 0 === $invalid_length;
+}
+
+/**
+ * Fallback mechanism for replacing invalid spans of UTF-8 bytes.
+ *
+ * Example:
+ *
+ *     'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see wp_scrub_utf8()
+ *
+ * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes.
+ * @return string Input string with spans of invalid bytes swapped with the replacement character.
+ */
+function _wp_scrub_utf8_fallback( string $bytes ): string {
+	$bytes_length   = strlen( $bytes );
+	$next_byte_at   = 0;
+	$was_at         = 0;
+	$invalid_length = 0;
+	$scrubbed       = '';
+
+	while ( $next_byte_at <= $bytes_length ) {
+		_wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
+
+		if ( $next_byte_at >= $bytes_length ) {
+			if ( 0 === $was_at ) {
+				return $bytes;
+			}
+
+			return $scrubbed . substr( $bytes, $was_at, $next_byte_at - $was_at - $invalid_length );
+		}
+
+		$scrubbed .= substr( $bytes, $was_at, $next_byte_at - $was_at );
+		$scrubbed .= "\u{FFFD}";
+
+		$next_byte_at += $invalid_length;
+		$was_at        = $next_byte_at;
+	}
+
+	return $scrubbed;
+}
+
+/**
+ * Returns how many code points are found in the given UTF-8 string.
+ *
+ * Invalid spans of bytes count as a single code point according
+ * to the maximal subpart rule. This function is a fallback method
+ * for calling `mb_strlen( $text, 'UTF-8' )`.
+ *
+ * When negative values are provided for the byte offsets or length,
+ * this will always report zero code points.
+ *
+ * Example:
+ *
+ *     4  === _wp_utf8_codepoint_count( 'text' );
+ *
+ *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
+ *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string $text            Count code points in this string.
+ * @param ?int   $byte_offset     Start counting after this many bytes in `$text`. Must be positive.
+ * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
+ *                                Default is to scan until the end of the string. Must be positive.
+ * @return int How many code points were found.
+ */
+function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
+	if ( $byte_offset < 0 ) {
+		return 0;
+	}
+
+	$count           = 0;
+	$at              = $byte_offset;
+	$end             = strlen( $text );
+	$invalid_length  = 0;
+	$max_byte_length = min( $end - $at, $max_byte_length );
+
+	while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
+		$count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );
+		$count += $invalid_length > 0 ? 1 : 0;
+		$at    += $invalid_length;
+	}
+
+	return $count;
+}
+
+/**
+ * Given a starting offset within a string and a maximum number of code points,
+ * return how many bytes are occupied by the span of characters.
+ *
+ * Invalid spans of bytes count as a single code point according to the maximal
+ * subpart rule. This function is a fallback method for calling
+ * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string $text              Count bytes of span in this text.
+ * @param int    $byte_offset       Start counting at this byte offset.
+ * @param int    $max_code_points   Stop counting after this many code points have been seen,
+ *                                  or at the end of the string.
+ * @param ?int   $found_code_points Optional. Will be set to number of found code points in
+ *                                  span, as this might be smaller than the maximum count if
+ *                                  the string is not long enough.
+ * @return int Number of bytes spanned by the code points.
+ */
+function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
+	$was_at            = $byte_offset;
+	$invalid_length    = 0;
+	$end               = strlen( $text );
+	$found_code_points = 0;
+
+	while ( $byte_offset < $end && $found_code_points < $max_code_points ) {
+		$needed      = $max_code_points - $found_code_points;
+		$chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed );
+
+		$found_code_points += $chunk_count;
+
+		// Invalid spans only convey one code point count regardless of how long they are.
+		if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) {
+			++$found_code_points;
+			$byte_offset += $invalid_length;
+		}
+	}
+
+	return $byte_offset - $was_at;
+}
+
+/**
+ * Fallback support for determining if a string contains Unicode noncharacters.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see \wp_has_noncharacters()
+ *
+ * @param string $text Are there noncharacters in this string?
+ * @return bool Whether noncharacters were found in the string.
+ */
+function _wp_has_noncharacters_fallback( string $text ): bool {
+	$at                = 0;
+	$invalid_length    = 0;
+	$has_noncharacters = false;
+	$end               = strlen( $text );
+
+	while ( $at < $end && ! $has_noncharacters ) {
+		_wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters );
+		$at += $invalid_length;
+	}
+
+	return $has_noncharacters;
+}
+
+/**
+ * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
+ * with the deprecated function from the PHP standard library.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see \utf8_encode()
+ *
+ * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
+ * @return string Text converted into UTF-8.
+ */
+function _wp_utf8_encode_fallback( $iso_8859_1_text ) {
+	$iso_8859_1_text = (string) $iso_8859_1_text;
+	$at              = 0;
+	$was_at          = 0;
+	$end             = strlen( $iso_8859_1_text );
+	$utf8            = '';
+
+	while ( $at < $end ) {
+		// US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
+		$ascii_byte_count = strspn(
+			$iso_8859_1_text,
+			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
+			$at
+		);
+
+		if ( $ascii_byte_count > 0 ) {
+			$at += $ascii_byte_count;
+			continue;
+		}
+
+		// All other bytes transform into two-byte UTF-8 sequences.
+		$code_point = ord( $iso_8859_1_text[ $at ] );
+		$byte1      = chr( 0xC0 | ( $code_point >> 6 ) );
+		$byte2      = chr( 0x80 | ( $code_point & 0x3F ) );
+
+		$utf8 .= substr( $iso_8859_1_text, $was_at, $at - $was_at );
+		$utf8 .= "{$byte1}{$byte2}";
+
+		++$at;
+		$was_at = $at;
+	}
+
+	if ( 0 === $was_at ) {
+		return $iso_8859_1_text;
+	}
+
+	$utf8 .= substr( $iso_8859_1_text, $was_at );
+	return $utf8;
+}
+
+/**
+ * Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility
+ * with the deprecated function from the PHP standard library.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see utf8_decode()
+ *
+ * @param string $utf8_text Text treated as UTF-8 bytes.
+ * @return string Text converted into ISO-8859-1.
+ */
+function _wp_utf8_decode_fallback( $utf8_text ) {
+	$utf8_text       = (string) $utf8_text;
+	$at              = 0;
+	$was_at          = 0;
+	$end             = strlen( $utf8_text );
+	$iso_8859_1_text = '';
+
+	while ( $at < $end ) {
+		// US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
+		$ascii_byte_count = strspn(
+			$utf8_text,
+			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
+			$at
+		);
+
+		if ( $ascii_byte_count > 0 ) {
+			$at += $ascii_byte_count;
+			continue;
+		}
+
+		$next_at        = $at;
+		$invalid_length = 0;
+		$found          = _wp_scan_utf8( $utf8_text, $next_at, $invalid_length, null, 1 );
+		$span_length    = $next_at - $at;
+		$next_byte      = '?';
+
+		if ( 1 !== $found ) {
+			if ( $invalid_length > 0 ) {
+				$next_byte = '';
+				goto flush_sub_part;
+			}
+
+			break;
+		}
+
+		// All convertible code points are two-bytes long.
+		$byte1 = ord( $utf8_text[ $at ] );
+		if ( 0xC0 !== ( $byte1 & 0xE0 ) ) {
+			goto flush_sub_part;
+		}
+
+		// All convertible code points are not greater than U+FF.
+		$byte2      = ord( $utf8_text[ $at + 1 ] );
+		$code_point = ( ( $byte1 & 0x1F ) << 6 ) | ( ( $byte2 & 0x3F ) );
+		if ( $code_point > 0xFF ) {
+			goto flush_sub_part;
+		}
+
+		$next_byte = chr( $code_point );
+
+		flush_sub_part:
+		$iso_8859_1_text .= substr( $utf8_text, $was_at, $at - $was_at );
+		$iso_8859_1_text .= $next_byte;
+		$at              += $span_length;
+		$was_at           = $at;
+
+		if ( $invalid_length > 0 ) {
+			$iso_8859_1_text .= '?';
+			$at              += $invalid_length;
+			$was_at           = $at;
+		}
+	}
+
+	if ( 0 === $was_at ) {
+		return $utf8_text;
+	}
+
+	$iso_8859_1_text .= substr( $utf8_text, $was_at );
+	return $iso_8859_1_text;
+}
+
+/**
+ * Indicates if a given byte stream represents valid UTF-8.
+ *
+ * Note that unpaired surrogate halves are not valid UTF-8 and will be rejected.
+ *
+ * Example:
+ *
+ *     true  === utf8_is_valid_byte_stream( 'Hello, World! 🌎' );
+ *
+ *     false === utf8_is_valid_byte_stream( "Latin1 is n\xF6t valid UTF-8.", 0, $error_at );
+ *     12    === $error_at;
+ *
+ *     false === utf8_is_valid_byte_stream( "Surrogate halves like '\xED\xA0\x80' are not permitted.", 0, $error_at );
+ *     23    === $error_at;
+ *
+ *     false === utf8_is_valid_byte_stream( "Broken stream: \xC2\xC2", 0, $error_at );
+ *     15    === $error_at;
+ *
+ * @param  string   $bytes  Text to validate as UTF-8 bytes.
+ * @param  int      $starting_byte  Byte offset in string where decoding should begin.
+ * @param  int|null $first_error_byte_at  Optional. If provided and byte stream fails to validate,
+ *                                     will be set to the byte offset where the first invalid
+ *                                     byte appeared. Otherwise, will not be set.
+ *
+ * @return bool Whether the given byte stream represents valid UTF-8.
+ * @since {WP_VERSION}
+ */
+function utf8_is_valid_byte_stream( string $bytes, int $starting_byte = 0, ?int &$first_error_byte_at = null ): bool {
+	$state         = UTF8_DECODER_ACCEPT;
+	$last_start_at = $starting_byte;
+
+	for ( $at = $starting_byte, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) {
+		if ( UTF8_DECODER_ACCEPT === $state ) {
+			$last_start_at = $at;
+		}
+
+		$state = utf8_decoder_apply_byte( $bytes[ $at ], $state );
+	}
+
+	if ( UTF8_DECODER_ACCEPT === $state ) {
+		return true;
+	} else {
+		$first_error_byte_at = $last_start_at;
+
+		return false;
+	}
+}
+
+/**
+ * Returns number of code points found within a UTF-8 string, similar to `strlen()`.
+ *
+ * If the byte stream fails to properly decode as UTF-8 this function will set the
+ * byte index of the first error byte and report the number of decoded code points.
+ *
+ * @param  string   $bytes  Text for which to count code points.
+ * @param  int|null $first_error_byte_at  Optional. If provided, will be set upon finding
+ *                                     the first invalid byte.
+ *
+ * @return int How many code points were decoded in the given byte stream before an error
+ *             or before reaching the end of the string.
+ * @since {WP_VERSION}
+ */
+function utf8_codepoint_count( string $bytes, ?int &$first_error_byte_at = null ): int {
+	$state         = UTF8_DECODER_ACCEPT;
+	$last_start_at = 0;
+	$count         = 0;
+	$codepoint     = 0;
+
+	for ( $at = 0, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) {
+		if ( UTF8_DECODER_ACCEPT === $state ) {
+			$last_start_at = $at;
+		}
+
+		$state = utf8_decoder_apply_byte( $bytes[ $at ], $state, $codepoint );
+
+		if ( UTF8_DECODER_ACCEPT === $state ) {
+			++$count;
+		}
+	}
+
+	if ( UTF8_DECODER_ACCEPT !== $state ) {
+		$first_error_byte_at = $last_start_at;
+	}
+
+	return $count;
+}
+
+/**
+ * Inner loop for a number of UTF-8 decoding-related functions.
+ *
+ * You probably don't need this! This is highly-specific and optimized
+ * code for UTF-8 operations used in other functions.
+ *
+ * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ *
+ * @since {WP_VERSION}
+ *
+ * @access private
+ *
+ * @param  string   $byte  Next byte to be applied in UTF-8 decoding or validation.
+ * @param  int      $state  UTF-8 decoding state, one of the following values:<br><ul>
+ *                                 <li>`UTF8_DECODER_ACCEPT`: Decoder is ready for a new code point.<br>
+ *                                 <li>`UTF8_DECODER_REJECT`: An error has occurred.<br>
+ *                                 Any other positive value: Decoder is waiting for additional bytes.
+ * @param  int|null $codepoint  Optional. If provided, will accumulate the decoded code point as
+ *                            each byte is processed. If not provided or unable to decode, will
+ *                            not be set, or will be set to invalid and unusable data.
+ *
+ * @return int Next decoder state after processing the current byte.
+ */
+function utf8_decoder_apply_byte( string $byte, int $state, int &$codepoint = 0 ): int {
+	/**
+	 * State classification and transition table for UTF-8 validation.
+	 *
+	 * > The first part of the table maps bytes to character classes that
+	 * > to reduce the size of the transition table and create bitmasks.
+	 * >
+	 * > The second part is a transition table that maps a combination
+	 * > of a state of the automaton and a character class to a state.
+	 *
+	 * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+	 */
+	static $state_table = (
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
+		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09" .
+		"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" .
+		"\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02" .
+		"\x10\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03" .
+		"\x11\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" .
+		"\x00\x01\x02\x03\x05\x08\x07\x01\x01\x01\x04\x06\x01\x01\x01\x01" .
+		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x01\x01\x01\x01\x01\x00\x01\x00\x01\x01\x01\x01\x01\x01" .
+		"\x01\x02\x01\x01\x01\x01\x01\x02\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01" .
+		"\x01\x02\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01" .
+		"\x01\x03\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01\x01\x03\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+	);
+
+	$byte      = ord( $byte );
+	$type      = ord( $state_table[ $byte ] );
+	$codepoint = ( UTF8_DECODER_ACCEPT === $state )
+		? ( ( 0xFF >> $type ) & $byte )
+		: ( ( $byte & 0x3F ) | ( $codepoint << 6 ) );
+
+	return ord( $state_table[ 256 + ( $state * 16 ) + $type ] );
+}
+
+/**
+ * Extract a slice of a text by code point, where invalid byte sequences count
+ * as a single code point, U+FFFD (the Unicode replacement character `�`).
+ *
+ * This function does not permit passing negative indices and will return
+ * the original string if such are provided.
+ *
+ * @param  string   $text    Input text from which to extract.
+ * @param  int      $from    Start extracting after this many code points.
+ * @param  int|null $length  Extract at most this many code points; null extracts to the end of the text.
+ *
+ * @return string Extracted slice of input string.
+ */
+function utf8_substr( string $text, int $from = 0, ?int $length = null ): string {
+	if ( $from < 0 || ( isset( $length ) && $length < 0 ) ) {
+		return $text;
+	}
+
+	$position_in_input = 0;
+	$codepoint_at      = 0;
+	$end_byte          = strlen( $text );
+	$buffer            = '';
+	$seen_codepoints   = 0;
+	$sliced_codepoints = 0;
+	$max_codepoints    = $length ?? PHP_INT_MAX;
+	$decoder_state     = UTF8_DECODER_ACCEPT;
+
+	// Decode one code point at a time; only those at or after `$from` count against the length limit.
+	while ( $position_in_input < $end_byte && $sliced_codepoints < $max_codepoints ) {
+		$decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );
+
+		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
+			++$position_in_input;
+			$chunk = substr( $text, $codepoint_at, $position_in_input - $codepoint_at );
+		} elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
+			// Replace the invalid span; the rejecting byte is scanned again unless it started the span.
+			$position_in_input = max( $position_in_input, $codepoint_at + 1 );
+			$decoder_state     = UTF8_DECODER_ACCEPT;
+			$chunk             = "\xEF\xBF\xBD";
+		} else {
+			++$position_in_input;
+			if ( $position_in_input < $end_byte ) {
+				continue;
+			}
+
+			// The input ended inside a multi-byte sequence: the truncated tail is one invalid span.
+			$chunk = "\xEF\xBF\xBD";
+		}
+
+		if ( $seen_codepoints >= $from ) {
+			++$sliced_codepoints;
+			$buffer .= $chunk;
+		}
+		++$seen_codepoints;
+		$codepoint_at = $position_in_input;
+	}
+
+	return $buffer;
+}
+
+/**
+ * Extract a unicode codepoint from a specific offset in text.
+ * Invalid byte sequences count as a single code point, U+FFFD
+ * (the Unicode replacement character `�`).
+ *
+ * This function does not permit passing negative indices and will return
+ * null if such are provided.
+ *
+ * @param  string $text  Input text from which to extract.
+ * @param  int    $byte_offset  Start at this byte offset in the input text.
+ * @param  int    $matched_bytes  How many bytes were matched; at least 1 when U+FFFD is returned.
+ *
+ * @return int|null Unicode codepoint, or null for a negative offset or when the text ends before one completes.
+ */
+function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes = 0 ) {
+	if ( $byte_offset < 0 ) {
+		return null;
+	}
+
+	$position_in_input = $byte_offset;
+	$end_byte          = strlen( $text );
+	$codepoint         = null;
+	$decoder_state     = UTF8_DECODER_ACCEPT;
+
+	// Feed bytes to the decoder until it accepts or rejects a single code point.
+	while ( $position_in_input < $end_byte ) {
+		$decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );
+
+		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
+			++$position_in_input;
+			$codepoint = utf8_ord( substr( $text, $byte_offset, $position_in_input - $byte_offset ) );
+			break;
+		} elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
+			// Report at least one matched byte so callers advancing by it cannot loop forever.
+			$position_in_input = max( $position_in_input, $byte_offset + 1 );
+			$codepoint = utf8_ord( "\xEF\xBF\xBD" );
+			break;
+		} else {
+			++$position_in_input;
+		}
+	}
+
+	$matched_bytes = $position_in_input - $byte_offset;
+
+	return $codepoint;
+}
+
+/**
+ * Convert a UTF-8 byte sequence to its Unicode codepoint.
+ *
+ * @param  string $character  UTF-8 encoded byte sequence representing a single Unicode character.
+ *
+ * @return int Unicode codepoint.
+ */
+function utf8_ord( string $character ): int {
+	// Convert the byte sequence to its binary representation.
+	$bytes = unpack( 'C*', $character );
+
+	// Initialize the codepoint.
+	$codepoint = 0;
+
+	// Calculate the codepoint based on the number of bytes.
+	if ( 1 === count( $bytes ) ) {
+		$codepoint = $bytes[1];
+	} elseif ( 2 === count( $bytes ) ) {
+		$codepoint = ( ( $bytes[1] & 0x1F ) << 6 ) | ( $bytes[2] & 0x3F );
+	} elseif ( 3 === count( $bytes ) ) {
+		$codepoint = ( ( $bytes[1] & 0x0F ) << 12 ) | ( ( $bytes[2] & 0x3F ) << 6 ) | ( $bytes[3] & 0x3F );
+	} elseif ( 4 === count( $bytes ) ) {
+		$codepoint = ( ( $bytes[1] & 0x07 ) << 18 ) | ( ( $bytes[2] & 0x3F ) << 12 ) | ( ( $bytes[3] & 0x3F ) << 6 ) | ( $bytes[4] & 0x3F );
+	}
+
+	return $codepoint;
+}
diff --git a/components/Encoding/utf8.php b/components/Encoding/utf8.php
index d373bab8..299bd273 100644
--- a/components/Encoding/utf8.php
+++ b/components/Encoding/utf8.php
@@ -198,144 +198,3 @@ function wp_has_noncharacters( string $text ): bool {
 		return _wp_has_noncharacters_fallback( $text );
 	}
 endif;
-
-/**
- * UTF-8 encoding pipeline by Dennis Snell (@dmsnell).
- *
- * It enables parsing XML documents with incomplete UTF-8 byte sequences
- * without crashing or depending on the mbstring extension.
- */
-
-/**
- * Encode a code point number into the UTF-8 encoding.
- *
- * This encoder implements the UTF-8 encoding algorithm for converting
- * a code point into a byte sequence. If it receives an invalid code
- * point it will return the Unicode Replacement Character U+FFFD `�`.
- *
- * Example:
- *
- *     '🅰' === WP_HTML_Decoder::codepoint_to_utf8_bytes( 0x1f170 );
- *
- *     // Half of a surrogate pair is an invalid code point.
- *     '�' === WP_HTML_Decoder::codepoint_to_utf8_bytes( 0xd83c );
- *
- * @since 6.6.0
- *
- * @see https://www.rfc-editor.org/rfc/rfc3629 For the UTF-8 standard.
- *
- * @param int $codepoint Which code point to convert.
- * @return string Converted code point, or `�` if invalid.
- */
-function codepoint_to_utf8_bytes( $codepoint ) {
-	// Pre-check to ensure a valid code point.
-	if (
-		$codepoint <= 0 ||
-		( $codepoint >= 0xD800 && $codepoint <= 0xDFFF ) ||
-		$codepoint > 0x10FFFF
-	) {
-		return '�';
-	}
-
-	if ( $codepoint <= 0x7F ) {
-		return chr( $codepoint );
-	}
-
-	if ( $codepoint <= 0x7FF ) {
-		$byte1 = chr( ( 0xC0 | ( ( $codepoint >> 6 ) & 0x1F ) ) );
-		$byte2 = chr( $codepoint & 0x3F | 0x80 );
-
-		return "{$byte1}{$byte2}";
-	}
-
-	if ( $codepoint <= 0xFFFF ) {
-		$byte1 = chr( ( $codepoint >> 12 ) | 0xE0 );
-		$byte2 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
-		$byte3 = chr( $codepoint & 0x3F | 0x80 );
-
-		return "{$byte1}{$byte2}{$byte3}";
-	}
-
-	// Any values above U+10FFFF are eliminated above in the pre-check.
-	$byte1 = chr( ( $codepoint >> 18 ) | 0xF0 );
-	$byte2 = chr( ( $codepoint >> 12 ) & 0x3F | 0x80 );
-	$byte3 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
-	$byte4 = chr( $codepoint & 0x3F | 0x80 );
-
-	return "{$byte1}{$byte2}{$byte3}{$byte4}";
-}
-
-
-/*
- * UTF-8 decoding pipeline by Dennis Snell (@dmsnell), originally
- * proposed in https://github.com/WordPress/wordpress-develop/pull/6883.
- *
- * It enables parsing XML documents with incomplete UTF-8 byte sequences
- * without crashing or depending on the mbstring extension.
- */
-
-/**
- * Extract a unicode codepoint from a specific offset in text.
- * Invalid byte sequences count as a single code point, U+FFFD
- * (the Unicode replacement character ``).
- *
- * This function does not permit passing negative indices and will return
- * null if such are provided.
- *
- * @param  string $text  Input text from which to extract.
- * @param  int    $byte_offset  Start at this byte offset in the input text.
- * @param  int    $matched_bytes  How many bytes were matched to produce the codepoint.
- *
- * @return int Unicode codepoint.
- */
-function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes = 0 ) {
-	if ( $byte_offset < 0 ) {
-		return null;
-	}
-
-	// Check if we're at or past the end of the string.
-	if ( $byte_offset >= strlen( $text ) ) {
-		$matched_bytes = 0;
-		return null;
-	}
-
-	$new_byte_offset = $byte_offset;
-	$invalid_length = 0;
-	// phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedFunctionFound
-	if ( 1 !== _wp_scan_utf8( $text, $new_byte_offset, $invalid_length, null, 1 ) ) {
-		// Ensure we always advance at least 1 byte to avoid infinite loops.
-		$matched_bytes = max( 1, $invalid_length );
-		return utf8_ord( "\u{FFFD}" );
-	}
-
-	$matched_bytes = $new_byte_offset - $byte_offset;
-	return utf8_ord( substr( $text, $byte_offset, $matched_bytes ) );
-}
-
-/**
- * Convert a UTF-8 byte sequence to its Unicode codepoint.
- *
- * @param  string $character  UTF-8 encoded byte sequence representing a single Unicode character.
- *
- * @return int Unicode codepoint.
- */
-function utf8_ord( string $character ): int {
-	// Convert the byte sequence to its binary representation.
-	$bytes = unpack( 'C*', $character );
-
-	// Initialize the codepoint.
-	$codepoint = 0;
-
-	// Calculate the codepoint based on the number of bytes.
-	if ( 1 === count( $bytes ) ) {
-		$codepoint = $bytes[1];
-	} elseif ( 2 === count( $bytes ) ) {
-		$codepoint = ( ( $bytes[1] & 0x1F ) << 6 ) | ( $bytes[2] & 0x3F );
-	} elseif ( 3 === count( $bytes ) ) {
-		$codepoint = ( ( $bytes[1] & 0x0F ) << 12 ) | ( ( $bytes[2] & 0x3F ) << 6 ) | ( $bytes[3] & 0x3F );
-	} elseif ( 4 === count( $bytes ) ) {
-		$codepoint = ( ( $bytes[1] & 0x07 ) << 18 ) | ( ( $bytes[2] & 0x3F ) << 12 ) | ( ( $bytes[3] & 0x3F ) << 6 ) | ( $bytes[4] & 0x3F );
-	}
-
-	return $codepoint;
-}
diff --git a/composer.json b/composer.json
index 6d52bb4b..1423cfa3 100644
--- a/composer.json
+++ b/composer.json
@@ -63,6 +63,7 @@
 			"components/Encoding/utf8.php",
 			"components/Encoding/compat-utf8.php",
 			"components/Encoding/utf8-encoder.php",
+			"components/Encoding/utf8-decoder.php",
 			"components/Filesystem/functions.php",
 			"components/Zip/functions.php",
 			"components/Polyfill/wordpress.php",

From b3d5d1f599eed83ca8b737042f1b5b7b6236cf01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 17:27:09 +0100
Subject: [PATCH 14/28] Restore the original utf8 encoder

---
 components/Encoding/utf8-encoder.php | 66 ++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/components/Encoding/utf8-encoder.php b/components/Encoding/utf8-encoder.php
index 7cd7b287..3406b1ea 100644
--- a/components/Encoding/utf8-encoder.php
+++ b/components/Encoding/utf8-encoder.php
@@ -1,3 +1,69 @@
 <?php
 
 namespace WordPress\Encoding;
+
+/**
+ * UTF-8 encoding pipeline by Dennis Snell (@dmsnell).
+ *
+ * It enables parsing XML documents with incomplete UTF-8 byte sequences
+ * without crashing or depending on the mbstring extension.
+ */
+
+/**
+ * Encode a code point number into the UTF-8 encoding.
+ *
+ * This encoder implements the UTF-8 encoding algorithm for converting
+ * a code point into a byte sequence. If it receives an invalid code
+ * point it will return the Unicode Replacement Character U+FFFD `�`.
+ * Invalid means zero or negative, a surrogate half, or above U+10FFFF.
+ *
+ * Example:
+ *
+ *     '🅰' === codepoint_to_utf8_bytes( 0x1f170 );
+ *
+ *     // Half of a surrogate pair is an invalid code point.
+ *     '�' === codepoint_to_utf8_bytes( 0xd83c );
+ *
+ * @since 6.6.0
+ *
+ * @see https://www.rfc-editor.org/rfc/rfc3629 For the UTF-8 standard.
+ *
+ * @param int $codepoint Which code point to convert.
+ * @return string Converted code point, or `�` if invalid.
+ */
+function codepoint_to_utf8_bytes( $codepoint ) {
+	// Pre-check to ensure a valid code point.
+	if (
+		$codepoint <= 0 ||
+		( $codepoint >= 0xD800 && $codepoint <= 0xDFFF ) ||
+		$codepoint > 0x10FFFF
+	) {
+		// U+FFFD as explicit bytes, so the result never depends on this file's encoding.
+		return "\xEF\xBF\xBD";
+	}
+
+	if ( $codepoint <= 0x7F ) {
+		return chr( $codepoint );
+	}
+
+	if ( $codepoint <= 0x7FF ) {
+		$byte1 = chr( ( 0xC0 | ( ( $codepoint >> 6 ) & 0x1F ) ) );
+		$byte2 = chr( $codepoint & 0x3F | 0x80 );
+		return "{$byte1}{$byte2}";
+	}
+
+	if ( $codepoint <= 0xFFFF ) {
+		$byte1 = chr( ( $codepoint >> 12 ) | 0xE0 );
+		$byte2 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
+		$byte3 = chr( $codepoint & 0x3F | 0x80 );
+		return "{$byte1}{$byte2}{$byte3}";
+	}
+
+	// Any values above U+10FFFF are eliminated above in the pre-check.
+	$byte1 = chr( ( $codepoint >> 18 ) | 0xF0 );
+	$byte2 = chr( ( $codepoint >> 12 ) & 0x3F | 0x80 );
+	$byte3 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
+	$byte4 = chr( $codepoint & 0x3F | 0x80 );
+
+	return "{$byte1}{$byte2}{$byte3}{$byte4}";
+}

From c9250d21cce8b8b79d0bc2ca9d4afc8ed34772b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 17:32:07 +0100
Subject: [PATCH 15/28] Improve CI autoloading

---
 composer-ci-matrix-tests.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/composer-ci-matrix-tests.json b/composer-ci-matrix-tests.json
index de4747f7..948e04a0 100644
--- a/composer-ci-matrix-tests.json
+++ b/composer-ci-matrix-tests.json
@@ -51,6 +51,7 @@
 			"components/Encoding/utf8.php",
 			"components/Encoding/compat-utf8.php",
 			"components/Encoding/utf8-encoder.php",
+			"components/Encoding/utf8-decoder.php",
 			"components/Filesystem/functions.php",
 			"components/Zip/functions.php",
 			"components/Polyfill/wordpress.php",

From 894d8770ce98bb89e9f1163eb73e316ec166e64f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 17:43:59 +0100
Subject: [PATCH 16/28] Use wp_is_valid_utf8 instead of wp_has_noncharacters

---
 components/Blueprints/class-runner.php | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/components/Blueprints/class-runner.php b/components/Blueprints/class-runner.php
index c4471d70..ee8ea209 100644
--- a/components/Blueprints/class-runner.php
+++ b/components/Blueprints/class-runner.php
@@ -55,7 +55,7 @@
 use WordPress\HttpClient\Client;
 use WordPress\Zip\ZipFilesystem;
 
-use function WordPress\Encoding\wp_has_noncharacters;
+use function WordPress\Encoding\wp_is_valid_utf8;
 use function WordPress\Filesystem\wp_unix_sys_get_temp_dir;
 use function WordPress\Zip\is_zip_file_stream;
 
@@ -379,7 +379,7 @@ private function load_blueprint() {
 		// Validate the Blueprint string we've just loaded.
 
 		// **UTF-8 Encoding:** Assert the Blueprint input is UTF-8 encoded.
-		$is_valid_utf8 = ! wp_has_noncharacters( $blueprint_string );
+		$is_valid_utf8 = wp_is_valid_utf8( $blueprint_string );
 
 		if ( ! $is_valid_utf8 ) {
 			throw new BlueprintExecutionException( 'Blueprint must be encoded as UTF-8.' );

From e1c078f35c3b5e6f35c1852236c5e862b68face5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 17:44:51 +0100
Subject: [PATCH 17/28] Remove unused import

---
 components/Encoding/utf8.php | 1 -
 1 file changed, 1 deletion(-)

diff --git a/components/Encoding/utf8.php b/components/Encoding/utf8.php
index 299bd273..0c74f7c1 100644
--- a/components/Encoding/utf8.php
+++ b/components/Encoding/utf8.php
@@ -5,7 +5,6 @@
 use function WordPress\Encoding\compat\_wp_is_valid_utf8_fallback;
 use function WordPress\Encoding\compat\_wp_scrub_utf8_fallback;
 use function WordPress\Encoding\compat\_wp_has_noncharacters_fallback;
-use function WordPress\Encoding\compat\_wp_scan_utf8;
 
 if ( extension_loaded( 'mbstring' ) ) :
 	/**

From fc74f22d6a3f87929102b4a397d0bfee8df52367 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 17:45:46 +0100
Subject: [PATCH 18/28] Remove duplicate wp_utf8 function declarations

---
 components/Encoding/utf8-decoder.php | 564 ---------------------------
 1 file changed, 564 deletions(-)

diff --git a/components/Encoding/utf8-decoder.php b/components/Encoding/utf8-decoder.php
index e339e710..7770440e 100644
--- a/components/Encoding/utf8-decoder.php
+++ b/components/Encoding/utf8-decoder.php
@@ -18,570 +18,6 @@
 	define( 'UTF8_DECODER_REJECT', 1 );
 }
 
-/**
- * Finds spans of valid and invalid UTF-8 bytes in a given string.
- *
- * This is a low-level tool to power various UTF-8 functionality.
- * It scans through a string until it finds invalid byte spans.
- * When it does this, it does three things:
- *
- *  - Assigns `$at` to the position after the last successful code point.
- *  - Assigns `$invalid_length` to the length of the maximal subpart of
- *    the invalid bytes starting at `$at`.
- *  - Returns how many code points were successfully scanned.
- *
- * This information is enough to build a number of useful UTF-8 functions.
- *
- * Example:
- *
- *     // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
- *     "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
- *     $at = $invalid_length = 0;
- *
- *     // The first step finds the invalid 0xF1 byte.
- *     2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
- *     $at === 2; $invalid_length === 1;
- *
- *     // The second step continues to the end of the string.
- *     1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
- *     $at === 4; $invalid_length === 0;
- *
- * Note! While passing an options array here might be convenient from a calling-code standpoint,
- *       this function is intended to serve as a very low-level foundation upon which to build
- *       higher level functionality. For the sake of keeping costs explicit all arguments are
- *       passed directly.
- *
- * @since 6.9.0
- * @access private
- *
- * @param string    $bytes             UTF-8 encoded string which might include invalid spans of bytes.
- * @param int       $at                Where to start scanning.
- * @param int       $invalid_length    Will be set to how many bytes are to be ignored after `$at`.
- * @param int|null  $max_bytes         Stop scanning after this many bytes have been seen.
- * @param int|null  $max_code_points   Stop scanning after this many code points have been seen.
- * @param bool|null $has_noncharacters Set to indicate if scanned string contained noncharacters.
- * @return int How many code points were successfully scanned.
- */
-function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
-	$byte_length       = strlen( $bytes );
-	$end               = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
-	$invalid_length    = 0;
-	$count             = 0;
-	$max_count         = $max_code_points ?? PHP_INT_MAX;
-	$has_noncharacters = false;
-
-	for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
-		/*
-		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
-		 *
-		 * This optimization step improves the speed from 10x to 100x
-		 * depending on whether the JIT has optimized the function.
-		 */
-		$ascii_byte_count = strspn(
-			$bytes,
-			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
-			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
-			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
-			$i,
-			$end - $i
-		);
-
-		if ( $count + $ascii_byte_count >= $max_count ) {
-			$at    = $i + ( $max_count - $count );
-			$count = $max_count;
-			return $count;
-		}
-
-		$count += $ascii_byte_count;
-		$i     += $ascii_byte_count;
-
-		if ( $i >= $end ) {
-			$at = $end;
-			return $count;
-		}
-
-		/**
-		 * The above fast-track handled all single-byte UTF-8 characters. What
-		 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
-		 *
-		 * Therefore everything past here is checking those multibyte sequences.
-		 *
-		 * It may look like there’s a need to check against the max bytes here,
-		 * but since each match of a single character returns, this functions will
-		 * bail already if crossing the max-bytes threshold. This function SHALL
-		 * NOT return in the middle of a multi-byte character, so if a character
-		 * falls on each side of the max bytes, the entire character will be scanned.
-		 *
-		 * Because it’s possible that there are truncated characters, the use of
-		 * the null-coalescing operator with "\xC0" is a convenience for skipping
-		 * length checks on every continuation bytes. This works because 0xC0 is
-		 * always invalid in a UTF-8 string, meaning that if the string has been
-		 * truncated, it will find 0xC0 and reject as invalid UTF-8.
-		 *
-		 * > [The following table] lists all of the byte sequences that are well-formed
-		 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
-		 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
-		 * > outside of the ranges listed is ill-formed.
-		 *
-		 * > Table 3-7. Well-Formed UTF-8 Byte Sequences
-		 *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
-		 *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
-		 *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
-		 *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
-		 *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
-		 *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
-		 *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
-		 *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
-		 *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
-		 *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
-		 *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
-		 *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
-		 *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
-		 *
-		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
-		 */
-
-		// Valid two-byte code points.
-		$b1 = ord( $bytes[ $i ] );
-		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
-
-		if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
-			++$count;
-			++$i;
-			continue;
-		}
-
-		// Valid three-byte code points.
-		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
-
-		if ( $b3 < 0x80 || $b3 > 0xBF ) {
-			goto invalid_utf8;
-		}
-
-		if (
-			( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
-			( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
-			( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
-			( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
-		) {
-			++$count;
-			$i += 2;
-
-			// Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
-			if ( 0xEF === $b1 ) {
-				$has_noncharacters |= (
-					( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
-					( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
-				);
-			}
-
-			continue;
-		}
-
-		// Valid four-byte code points.
-		$b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
-
-		if ( $b4 < 0x80 || $b4 > 0xBF ) {
-			goto invalid_utf8;
-		}
-
-		if (
-			( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
-			( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
-			( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
-		) {
-			++$count;
-			$i += 3;
-
-			// Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
-			$has_noncharacters |= (
-				( 0x0F === ( $b2 & 0x0F ) ) &&
-				0xBF === $b3 &&
-				( 0xBE === $b4 || 0xBF === $b4 )
-			);
-
-			continue;
-		}
-
-		/**
-		 * When encountering invalid byte sequences, Unicode suggests finding the
-		 * maximal subpart of a text and replacing that subpart with a single
-		 * replacement character.
-		 *
-		 * > This practice is more secure because it does not result in the
-		 * > conversion consuming parts of valid sequences as though they were
-		 * > invalid. It also guarantees at least one replacement character will
-		 * > occur for each instance of an invalid sequence in the original text.
-		 * > Furthermore, this practice can be defined consistently for better
-		 * > interoperability between different implementations of conversion.
-		 *
-		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
-		 */
-		invalid_utf8:
-		$at             = $i;
-		$invalid_length = 1;
-
-		// Single-byte and two-byte characters.
-		if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) {
-			return $count;
-		}
-
-		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
-		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
-
-		// Find the maximal subpart and skip past it.
-		if ( 0xE0 === ( $b1 & 0xF0 ) ) {
-			// Three-byte characters.
-			$b2_valid = (
-				( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
-				( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
-				( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
-				( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
-			);
-
-			$invalid_length = min( $end - $i, $b2_valid ? 2 : 1 );
-			return $count;
-		} elseif ( 0xF0 === ( $b1 & 0xF8 ) ) {
-			// Four-byte characters.
-			$b2_valid = (
-				( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
-				( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
-				( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
-			);
-
-			$b3_valid = $b3 >= 0x80 && $b3 <= 0xBF;
-
-			$invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 );
-			return $count;
-		}
-
-		return $count;
-	}
-
-	$at = $i;
-	return $count;
-}
-
-/**
- * Fallback mechanism for safely validating UTF-8 bytes.
- *
- * @since 6.9.0
- * @access private
- *
- * @see wp_is_valid_utf8()
- *
- * @param string $bytes String which might contain text encoded as UTF-8.
- * @return bool Whether the provided bytes can decode as valid UTF-8.
- */
-function _wp_is_valid_utf8_fallback( string $bytes ): bool {
-	$bytes_length = strlen( $bytes );
-	if ( 0 === $bytes_length ) {
-		return true;
-	}
-
-	$next_byte_at   = 0;
-	$invalid_length = 0;
-
-	_wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
-
-	return $bytes_length === $next_byte_at && 0 === $invalid_length;
-}
-
-/**
- * Fallback mechanism for replacing invalid spans of UTF-8 bytes.
- *
- * Example:
- *
- *     'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252.
- *
- * @since 6.9.0
- * @access private
- *
- * @see wp_scrub_utf8()
- *
- * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes.
- * @return string Input string with spans of invalid bytes swapped with the replacement character.
- */
-function _wp_scrub_utf8_fallback( string $bytes ): string {
-	$bytes_length   = strlen( $bytes );
-	$next_byte_at   = 0;
-	$was_at         = 0;
-	$invalid_length = 0;
-	$scrubbed       = '';
-
-	while ( $next_byte_at <= $bytes_length ) {
-		_wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
-
-		if ( $next_byte_at >= $bytes_length ) {
-			if ( 0 === $was_at ) {
-				return $bytes;
-			}
-
-			return $scrubbed . substr( $bytes, $was_at, $next_byte_at - $was_at - $invalid_length );
-		}
-
-		$scrubbed .= substr( $bytes, $was_at, $next_byte_at - $was_at );
-		$scrubbed .= "\u{FFFD}";
-
-		$next_byte_at += $invalid_length;
-		$was_at        = $next_byte_at;
-	}
-
-	return $scrubbed;
-}
-
-/**
- * Returns how many code points are found in the given UTF-8 string.
- *
- * Invalid spans of bytes count as a single code point according
- * to the maximal subpart rule. This function is a fallback method
- * for calling `mb_strlen( $text, 'UTF-8' )`.
- *
- * When negative values are provided for the byte offsets or length,
- * this will always report zero code points.
- *
- * Example:
- *
- *     4  === _wp_utf8_codepoint_count( 'text' );
- *
- *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
- *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
- *
- * @since 6.9.0
- * @access private
- *
- * @param string $text            Count code points in this string.
- * @param ?int   $byte_offset     Start counting after this many bytes in `$text`. Must be positive.
- * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
- *                                Default is to scan until the end of the string. Must be positive.
- * @return int How many code points were found.
- */
-function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
-	if ( $byte_offset < 0 ) {
-		return 0;
-	}
-
-	$count           = 0;
-	$at              = $byte_offset;
-	$end             = strlen( $text );
-	$invalid_length  = 0;
-	$max_byte_length = min( $end - $at, $max_byte_length );
-
-	while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
-		$count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );
-		$count += $invalid_length > 0 ? 1 : 0;
-		$at    += $invalid_length;
-	}
-
-	return $count;
-}
-
-/**
- * Given a starting offset within a string and a maximum number of code points,
- * return how many bytes are occupied by the span of characters.
- *
- * Invalid spans of bytes count as a single code point according to the maximal
- * subpart rule. This function is a fallback method for calling
- * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`.
- *
- * @since 6.9.0
- * @access private
- *
- * @param string $text              Count bytes of span in this text.
- * @param int    $byte_offset       Start counting at this byte offset.
- * @param int    $max_code_points   Stop counting after this many code points have been seen,
- *                                  or at the end of the string.
- * @param ?int   $found_code_points Optional. Will be set to number of found code points in
- *                                  span, as this might be smaller than the maximum count if
- *                                  the string is not long enough.
- * @return int Number of bytes spanned by the code points.
- */
-function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
-	$was_at            = $byte_offset;
-	$invalid_length    = 0;
-	$end               = strlen( $text );
-	$found_code_points = 0;
-
-	while ( $byte_offset < $end && $found_code_points < $max_code_points ) {
-		$needed      = $max_code_points - $found_code_points;
-		$chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed );
-
-		$found_code_points += $chunk_count;
-
-		// Invalid spans only convey one code point count regardless of how long they are.
-		if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) {
-			++$found_code_points;
-			$byte_offset += $invalid_length;
-		}
-	}
-
-	return $byte_offset - $was_at;
-}
-
-/**
- * Fallback support for determining if a string contains Unicode noncharacters.
- *
- * @since 6.9.0
- * @access private
- *
- * @see \wp_has_noncharacters()
- *
- * @param string $text Are there noncharacters in this string?
- * @return bool Whether noncharacters were found in the string.
- */
-function _wp_has_noncharacters_fallback( string $text ): bool {
-	$at                = 0;
-	$invalid_length    = 0;
-	$has_noncharacters = false;
-	$end               = strlen( $text );
-
-	while ( $at < $end && ! $has_noncharacters ) {
-		_wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters );
-		$at += $invalid_length;
-	}
-
-	return $has_noncharacters;
-}
-
-/**
- * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
- * with the deprecated function from the PHP standard library.
- *
- * @since 6.9.0
- * @access private
- *
- * @see \utf8_encode()
- *
- * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
- * @return string Text converted into UTF-8.
- */
-function _wp_utf8_encode_fallback( $iso_8859_1_text ) {
-	$iso_8859_1_text = (string) $iso_8859_1_text;
-	$at              = 0;
-	$was_at          = 0;
-	$end             = strlen( $iso_8859_1_text );
-	$utf8            = '';
-
-	while ( $at < $end ) {
-		// US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
-		$ascii_byte_count = strspn(
-			$iso_8859_1_text,
-			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
-			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
-			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
-			$at
-		);
-
-		if ( $ascii_byte_count > 0 ) {
-			$at += $ascii_byte_count;
-			continue;
-		}
-
-		// All other bytes transform into two-byte UTF-8 sequences.
-		$code_point = ord( $iso_8859_1_text[ $at ] );
-		$byte1      = chr( 0xC0 | ( $code_point >> 6 ) );
-		$byte2      = chr( 0x80 | ( $code_point & 0x3F ) );
-
-		$utf8 .= substr( $iso_8859_1_text, $was_at, $at - $was_at );
-		$utf8 .= "{$byte1}{$byte2}";
-
-		++$at;
-		$was_at = $at;
-	}
-
-	if ( 0 === $was_at ) {
-		return $iso_8859_1_text;
-	}
-
-	$utf8 .= substr( $iso_8859_1_text, $was_at );
-	return $utf8;
-}
-
-/**
- * Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility
- * with the deprecated function from the PHP standard library.
- *
- * @since 6.9.0
- * @access private
- *
- * @see utf8_decode()
- *
- * @param string $utf8_text Text treated as UTF-8 bytes.
- * @return string Text converted into ISO-8859-1.
- */
-function _wp_utf8_decode_fallback( $utf8_text ) {
-	$utf8_text       = (string) $utf8_text;
-	$at              = 0;
-	$was_at          = 0;
-	$end             = strlen( $utf8_text );
-	$iso_8859_1_text = '';
-
-	while ( $at < $end ) {
-		// US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
-		$ascii_byte_count = strspn(
-			$utf8_text,
-			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
-			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
-			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
-			$at
-		);
-
-		if ( $ascii_byte_count > 0 ) {
-			$at += $ascii_byte_count;
-			continue;
-		}
-
-		$next_at        = $at;
-		$invalid_length = 0;
-		$found          = _wp_scan_utf8( $utf8_text, $next_at, $invalid_length, null, 1 );
-		$span_length    = $next_at - $at;
-		$next_byte      = '?';
-
-		if ( 1 !== $found ) {
-			if ( $invalid_length > 0 ) {
-				$next_byte = '';
-				goto flush_sub_part;
-			}
-
-			break;
-		}
-
-		// All convertible code points are two-bytes long.
-		$byte1 = ord( $utf8_text[ $at ] );
-		if ( 0xC0 !== ( $byte1 & 0xE0 ) ) {
-			goto flush_sub_part;
-		}
-
-		// All convertible code points are not greater than U+FF.
-		$byte2      = ord( $utf8_text[ $at + 1 ] );
-		$code_point = ( ( $byte1 & 0x1F ) << 6 ) | ( ( $byte2 & 0x3F ) );
-		if ( $code_point > 0xFF ) {
-			goto flush_sub_part;
-		}
-
-		$next_byte = chr( $code_point );
-
-		flush_sub_part:
-		$iso_8859_1_text .= substr( $utf8_text, $was_at, $at - $was_at );
-		$iso_8859_1_text .= $next_byte;
-		$at              += $span_length;
-		$was_at           = $at;
-
-		if ( $invalid_length > 0 ) {
-			$iso_8859_1_text .= '?';
-			$at              += $invalid_length;
-			$was_at           = $at;
-		}
-	}
-
-	if ( 0 === $was_at ) {
-		return $utf8_text;
-	}
-
-	$iso_8859_1_text .= substr( $utf8_text, $was_at );
-	return $iso_8859_1_text;
-}
-
 /**
  * Indicates if a given byte stream represents valid UTF-8.
  *

From d6973cfa726e5b32aaaf36298e1fc6c329b9e1ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 17:46:35 +0100
Subject: [PATCH 19/28] Remove unused legacy utf8 pipeline functions

---
 components/Encoding/utf8-decoder.php | 87 ----------------------------
 1 file changed, 87 deletions(-)

diff --git a/components/Encoding/utf8-decoder.php b/components/Encoding/utf8-decoder.php
index 7770440e..69440230 100644
--- a/components/Encoding/utf8-decoder.php
+++ b/components/Encoding/utf8-decoder.php
@@ -18,93 +18,6 @@
 	define( 'UTF8_DECODER_REJECT', 1 );
 }
 
-/**
- * Indicates if a given byte stream represents valid UTF-8.
- *
- * Note that unpaired surrogate halves are not valid UTF-8 and will be rejected.
- *
- * Example:
- *
- *     true  === utf8_is_valid_byte_stream( 'Hello, World! 🌎' );
- *
- *     false === utf8_is_valid_byte_stream( "Latin1 is n\xF6t valid UTF-8.", 0, $error_at );
- *     12    === $error_at;
- *
- *     false === utf8_is_valid_byte_stream( "Surrogate halves like '\xDE\xFF\x80' are not permitted.", 0, $error_at );
- *     23    === $error_at;
- *
- *     false === utf8_is_valid_byte_stream( "Broken stream: \xC2\xC2", 0, $error_at );
- *     15    === $error_at;
- *
- * @param  string   $bytes  Text to validate as UTF-8 bytes.
- * @param  int      $starting_byte  Byte offset in string where decoding should begin.
- * @param  int|null $first_error_byte_at  Optional. If provided and byte stream fails to validate,
- *                                     will be set to the byte offset where the first invalid
- *                                     byte appeared. Otherwise, will not be set.
- *
- * @return bool Whether the given byte stream represents valid UTF-8.
- * @since {WP_VERSION}
- */
-function utf8_is_valid_byte_stream( string $bytes, int $starting_byte = 0, ?int &$first_error_byte_at = null ): bool {
-	$state         = UTF8_DECODER_ACCEPT;
-	$last_start_at = $starting_byte;
-
-	for ( $at = $starting_byte, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) {
-		if ( UTF8_DECODER_ACCEPT === $state ) {
-			$last_start_at = $at;
-		}
-
-		$state = utf8_decoder_apply_byte( $bytes[ $at ], $state );
-	}
-
-	if ( UTF8_DECODER_ACCEPT === $state ) {
-		return true;
-	} else {
-		$first_error_byte_at = $last_start_at;
-
-		return false;
-	}
-}
-
-/**
- * Returns number of code points found within a UTF-8 string, similar to `strlen()`.
- *
- * If the byte stream fails to properly decode as UTF-8 this function will set the
- * byte index of the first error byte and report the number of decoded code points.
- *
- * @param  string   $bytes  Text for which to count code points.
- * @param  int|null $first_error_byte_at  Optional. If provided, will be set upon finding
- *                                     the first invalid byte.
- *
- * @return int How many code points were decoded in the given byte stream before an error
- *             or before reaching the end of the string.
- * @since {WP_VERSION}
- */
-function utf8_codepoint_count( string $bytes, ?int &$first_error_byte_at = null ): int {
-	$state         = UTF8_DECODER_ACCEPT;
-	$last_start_at = 0;
-	$count         = 0;
-	$codepoint     = 0;
-
-	for ( $at = 0, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) {
-		if ( UTF8_DECODER_ACCEPT === $state ) {
-			$last_start_at = $at;
-		}
-
-		$state = utf8_decoder_apply_byte( $bytes[ $at ], $state, $codepoint );
-
-		if ( UTF8_DECODER_ACCEPT === $state ) {
-			++$count;
-		}
-	}
-
-	if ( UTF8_DECODER_ACCEPT !== $state ) {
-		$first_error_byte_at = $last_start_at;
-	}
-
-	return $count;
-}
-
 /**
  * Inner loop for a number of UTF-8 decoding-related functions.
  *

From 05afd5d4098c10dbd9c52f8f1c398e52d100355f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 17:53:25 +0100
Subject: [PATCH 20/28] Invert the condition

---
 components/Blueprints/class-runner.php | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/components/Blueprints/class-runner.php b/components/Blueprints/class-runner.php
index ee8ea209..5678c849 100644
--- a/components/Blueprints/class-runner.php
+++ b/components/Blueprints/class-runner.php
@@ -379,9 +379,7 @@ private function load_blueprint() {
 		// Validate the Blueprint string we've just loaded.
 
 		// **UTF-8 Encoding:** Assert the Blueprint input is UTF-8 encoded.
-		$is_valid_utf8 = ! wp_is_valid_utf8( $blueprint_string );
-
-		if ( ! $is_valid_utf8 ) {
+		if ( ! wp_is_valid_utf8( $blueprint_string ) ) {
 			throw new BlueprintExecutionException( 'Blueprint must be encoded as UTF-8.' );
 		}
 

From a9de875c9beb2482336827c6a186a0b07f2e0028 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 18:06:38 +0100
Subject: [PATCH 21/28] Remove unused legacy utf8_substr function

---
 components/Encoding/utf8-decoder.php | 183 +++++++++------------------
 1 file changed, 62 insertions(+), 121 deletions(-)

diff --git a/components/Encoding/utf8-decoder.php b/components/Encoding/utf8-decoder.php
index 69440230..e62ff5c5 100644
--- a/components/Encoding/utf8-decoder.php
+++ b/components/Encoding/utf8-decoder.php
@@ -18,127 +18,6 @@
 	define( 'UTF8_DECODER_REJECT', 1 );
 }
 
-/**
- * Inner loop for a number of UTF-8 decoding-related functions.
- *
- * You probably don't need this! This is highly-specific and optimized
- * code for UTF-8 operations used in other functions.
- *
- * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
- *
- * @since {WP_VERSION}
- *
- * @access private
- *
- * @param  string   $byte  Next byte to be applied in UTF-8 decoding or validation.
- * @param  int      $state  UTF-8 decoding state, one of the following values:<br><ul>
- *                                 <li>`UTF8_DECODER_ACCEPT`: Decoder is ready for a new code point.<br>
- *                                 <li>`UTF8_DECODER_REJECT`: An error has occurred.<br>
- *                                 Any other positive value: Decoder is waiting for additional bytes.
- * @param  int|null $codepoint  Optional. If provided, will accumulate the decoded code point as
- *                            each byte is processed. If not provided or unable to decode, will
- *                            not be set, or will be set to invalid and unusable data.
- *
- * @return int Next decoder state after processing the current byte.
- */
-function utf8_decoder_apply_byte( string $byte, int $state, int &$codepoint = 0 ): int {
-	/**
-	 * State classification and transition table for UTF-8 validation.
-	 *
-	 * > The first part of the table maps bytes to character classes that
-	 * > to reduce the size of the transition table and create bitmasks.
-	 * >
-	 * > The second part is a transition table that maps a combination
-	 * > of a state of the automaton and a character class to a state.
-	 *
-	 * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
-	 */
-	static $state_table = (
-		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
-		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
-		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
-		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
-		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09" .
-		"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" .
-		"\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02" .
-		"\x10\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03" .
-		"\x11\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" .
-		"\x00\x01\x02\x03\x05\x08\x07\x01\x01\x01\x04\x06\x01\x01\x01\x01" .
-		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x01\x01\x01\x01\x01\x00\x01\x00\x01\x01\x01\x01\x01\x01" .
-		"\x01\x02\x01\x01\x01\x01\x01\x02\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01" .
-		"\x01\x02\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01" .
-		"\x01\x03\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01\x01\x03\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
-	);
-
-	$byte      = ord( $byte );
-	$type      = ord( $state_table[ $byte ] );
-	$codepoint = ( UTF8_DECODER_ACCEPT === $state )
-		? ( ( 0xFF >> $type ) & $byte )
-		: ( ( $byte & 0x3F ) | ( $codepoint << 6 ) );
-
-	return ord( $state_table[ 256 + ( $state * 16 ) + $type ] );
-}
-
-/**
- * Extract a slice of a text by code point, where invalid byte sequences count
- * as a single code point, U+FFFD (the Unicode replacement character `�`).
- *
- * This function does not permit passing negative indices and will return
- * the original string if such are provide.
- *
- * @param  string $text  Input text from which to extract.
- * @param  int    $from  Start extracting after this many code-points.
- * @param  int    $length  Extract this many code points.
- *
- * @return string Extracted slice of input string.
- */
-function utf8_substr( string $text, int $from = 0, ?int $length = null ): string {
-	if ( $from < 0 || ( isset( $length ) && $length < 0 ) ) {
-		return $text;
-	}
-
-	$position_in_input = 0;
-	$codepoint_at      = 0;
-	$end_byte          = strlen( $text );
-	$buffer            = '';
-	$seen_codepoints   = 0;
-	$sliced_codepoints = 0;
-	$decoder_state     = UTF8_DECODER_ACCEPT;
-
-	// Get to the start of the string.
-	while ( $position_in_input < $end_byte && $seen_codepoints < $length ) {
-		$decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );
-
-		if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
-			++$position_in_input;
-
-			if ( $seen_codepoints >= $from ) {
-				++$sliced_codepoints;
-				$buffer .= substr( $text, $codepoint_at, $position_in_input - $codepoint_at );
-			}
-
-			++$seen_codepoints;
-			$codepoint_at = $position_in_input;
-		} elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
-			// "\u{FFFD}" is not supported in PHP 5.6.
-			$buffer .= "\xEF\xBF\xBD";
-
-			// Skip to the start of the next code point.
-			while ( UTF8_DECODER_REJECT === $decoder_state && $position_in_input < $end_byte ) {
-				$decoder_state = utf8_decoder_apply_byte( $text[ ++$position_in_input ], UTF8_DECODER_ACCEPT );
-			}
-
-			++$seen_codepoints;
-			$codepoint_at  = $position_in_input;
-			$decoder_state = UTF8_DECODER_ACCEPT;
-		} else {
-			++$position_in_input;
-		}
-	}
-
-	return $buffer;
-}
-
 /**
  * Extract a unicode codepoint from a specific offset in text.
  * Invalid byte sequences count as a single code point, U+FFFD
@@ -213,3 +92,65 @@ function utf8_ord( string $character ): int {
 
 	return $codepoint;
 }
+
+
+/**
+ * Inner loop for a number of UTF-8 decoding-related functions.
+ *
+ * You probably don't need this! This is highly-specific and optimized
+ * code for UTF-8 operations used in other functions.
+ *
+ * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ *
+ * @since {WP_VERSION}
+ *
+ * @access private
+ *
+ * @param  string   $byte  Next byte to be applied in UTF-8 decoding or validation.
+ * @param  int      $state  UTF-8 decoding state, one of the following values:<br><ul>
+ *                                 <li>`UTF8_DECODER_ACCEPT`: Decoder is ready for a new code point.<br>
+ *                                 <li>`UTF8_DECODER_REJECT`: An error has occurred.<br>
+ *                                 Any other positive value: Decoder is waiting for additional bytes.
+ * @param  int      $codepoint  Optional. If provided, will accumulate the decoded code point as
+ *                            each byte is processed. If not provided or unable to decode, will
+ *                            not be set, or will be set to invalid and unusable data.
+ *
+ * @return int Next decoder state after processing the current byte.
+ */
+function utf8_decoder_apply_byte( string $byte, int $state, int &$codepoint = 0 ): int {
+	/**
+	 * State classification and transition table for UTF-8 validation.
+	 *
+	 * > The first part of the table maps bytes to character classes that
+	 * > to reduce the size of the transition table and create bitmasks.
+	 * >
+	 * > The second part is a transition table that maps a combination
+	 * > of a state of the automaton and a character class to a state.
+	 *
+	 * @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+	 */
+	static $state_table = (
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
+		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09" .
+		"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" .
+		"\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02" .
+		"\x10\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03" .
+		"\x11\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" .
+		"\x00\x01\x02\x03\x05\x08\x07\x01\x01\x01\x04\x06\x01\x01\x01\x01" .
+		"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x01\x01\x01\x01\x01\x00\x01\x00\x01\x01\x01\x01\x01\x01" .
+		"\x01\x02\x01\x01\x01\x01\x01\x02\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01" .
+		"\x01\x02\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01" .
+		"\x01\x03\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01\x01\x03\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+	);
+
+	$byte      = ord( $byte );
+	$type      = ord( $state_table[ $byte ] );
+	$codepoint = ( UTF8_DECODER_ACCEPT === $state )
+		? ( ( 0xFF >> $type ) & $byte )
+		: ( ( $byte & 0x3F ) | ( $codepoint << 6 ) );
+
+	return ord( $state_table[ 256 + ( $state * 16 ) + $type ] );
+}
\ No newline at end of file

From 049c8cb9f85db139d3c0ff4f4a9a8ff34c8aab1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 18:14:55 +0100
Subject: [PATCH 22/28] Replace utf8_codepoint_at with _wp_scan_utf8

---
 .../DataLiberation/URL/class-cssprocessor.php |  6 ++--
 components/Encoding/utf8-decoder.php          |  7 ++--
 components/XML/class-xmlprocessor.php         | 33 ++++++++++---------
 3 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/components/DataLiberation/URL/class-cssprocessor.php b/components/DataLiberation/URL/class-cssprocessor.php
index 544f2dbc..f5ccdf63 100644
--- a/components/DataLiberation/URL/class-cssprocessor.php
+++ b/components/DataLiberation/URL/class-cssprocessor.php
@@ -4,8 +4,8 @@
 
 use function WordPress\Encoding\compat\_wp_scan_utf8;
 use function WordPress\Encoding\wp_scrub_utf8;
-use function WordPress\Encoding\utf8_codepoint_at;
 use function WordPress\Encoding\codepoint_to_utf8_bytes;
+use function WordPress\Encoding\utf8_ord;
 
 /**
  * Tokenizes CSS according to the CSS Syntax Level 3 specification.
@@ -1508,8 +1508,8 @@ private function consume_ident_start_codepoint( $at ): int {
 		}
 
 		$codepoint_byte_length = $new_at - $at;
-		$codepoint             = utf8_codepoint_at( $this->css, $at );
-		if ( null !== $codepoint && $codepoint >= 0x80 ) {
+		$codepoint = utf8_ord( substr( $this->css, $at, $codepoint_byte_length ) );
+		if ( $codepoint >= 0x80 ) {
 			return $codepoint_byte_length;
 		}
 		return 0;
diff --git a/components/Encoding/utf8-decoder.php b/components/Encoding/utf8-decoder.php
index e62ff5c5..bc041f8b 100644
--- a/components/Encoding/utf8-decoder.php
+++ b/components/Encoding/utf8-decoder.php
@@ -1,6 +1,7 @@
 <?php
 
 namespace WordPress\Encoding;
+use function WordPress\Encoding\compat\_wp_scan_utf8;
 
 /*
  * UTF-8 decoding pipeline by Dennis Snell (@dmsnell), originally
@@ -33,8 +34,10 @@
  * @return int Unicode codepoint.
  */
 function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes = 0 ) {
-	if ( $byte_offset < 0 ) {
-		return null;
+	if ( 1 !== _wp_scan_utf8( $text, $byte_offset, $matched_bytes, null, 1 ) ) {
+
+		// "\u{FFFD}" is not supported in PHP 5.6.
+		$codepoint = utf8_ord( "\u{FFFD}" );
 	}
 
 	$position_in_input = $byte_offset;
diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php
index 5c15f8eb..788d1b8b 100644
--- a/components/XML/class-xmlprocessor.php
+++ b/components/XML/class-xmlprocessor.php
@@ -5,7 +5,8 @@
 use WP_HTML_Span;
 use WP_HTML_Text_Replacement;
 
-use function WordPress\Encoding\utf8_codepoint_at;
+use function WordPress\Encoding\compat\_wp_scan_utf8;
+use function WordPress\Encoding\utf8_ord;
 
 /**
  * XML API: XMLProcessor class
@@ -43,9 +44,9 @@
  *
  * @TODO: Support XML 1.1.
  *
- * @TODO: Evaluate the performance of utf8_codepoint_at() against using the mbstring
+ * @TODO: Evaluate the performance of _wp_scan_utf8() against using the mbstring
  *        extension. If mbstring is faster, then use it whenever it's available with
- *        utf8_codepoint_at() as a fallback.
+ *        _wp_scan_utf8() as a fallback.
  *
  * @package WordPress
  * @subpackage HTML-API
@@ -2348,27 +2349,29 @@ private function parse_name( $offset ) {
 			 *                  how to reliably reproduce this failure mode in a
 			 *                  unit test.
 			 *
-			 * Performance-wise, character-by-character processing via utf8_codepoint_at
+			 * Performance-wise, character-by-character processing via _wp_scan_utf8
 			 * is still much faster than relying on preg_match(). The mbstring extension
 			 * is likely faster. It would be interesting to evaluate the performance
 			 * and prefer mbstring whenever it's available.
 			 */
-			$codepoint = utf8_codepoint_at(
+			$bytes_parsed = _wp_scan_utf8(
 				$this->xml,
 				$offset + $name_byte_length,
-				$bytes_parsed
+				$invalid_length,
+				null,
+				1
 			);
-			if (
-				// Byte sequence is not a valid UTF-8 codepoint.
-				( 0xFFFD === $codepoint && 0 === $bytes_parsed ) ||
-				// No codepoint at the given offset.
-				null === $codepoint ||
-				// The codepoint is not a valid part of an XML NameChar or NameStartChar.
-				! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length )
-			) {
+			// EOF or invalid byte sequence.
+			if ( 1 !== $bytes_parsed ) {
+				break;
+			}
+
+			$codepoint = utf8_ord( substr( $this->xml, $offset + $name_byte_length, $bytes_parsed ) );
+
+			// The codepoint is not a valid part of an XML NameChar or NameStartChar.
+			if ( ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) ) {
 				break;
 			}
-			$codepoint         = null;
 			$name_byte_length += $bytes_parsed;
 		}
 

From 8a4b939bb163d4d779496e6ef4428507d580a779 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 18:19:04 +0100
Subject: [PATCH 23/28] Pass $at by reference

---
 components/XML/class-xmlprocessor.php | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php
index 788d1b8b..1b3c248d 100644
--- a/components/XML/class-xmlprocessor.php
+++ b/components/XML/class-xmlprocessor.php
@@ -2354,25 +2354,21 @@ private function parse_name( $offset ) {
 			 * is likely faster. It would be interesting to evaluate the performance
 			 * and prefer mbstring whenever it's available.
 			 */
-			$bytes_parsed = _wp_scan_utf8(
-				$this->xml,
-				$offset + $name_byte_length,
-				$invalid_length,
-				null,
-				1
-			);
-			// EOF or invalid byte sequence.
-			if ( 1 !== $bytes_parsed ) {
+			$at = $offset + $name_byte_length;
+			$new_at = $at;
+			if ( 1 !== _wp_scan_utf8( $this->xml, $new_at, $invalid_length, null, 1 ) ) {
+				// EOF or invalid utf-8 byte sequence.
 				break;
 			}
 
-			$codepoint = utf8_ord( substr( $this->xml, $offset + $name_byte_length, $bytes_parsed ) );
+			$codepoint_byte_length = $new_at - $at;
+			$codepoint = utf8_ord( substr( $this->xml, $at, $codepoint_byte_length ) );
 
 			// The codepoint is not a valid part of an XML NameChar or NameStartChar.
 			if ( ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) ) {
 				break;
 			}
-			$name_byte_length += $bytes_parsed;
+			$name_byte_length += $codepoint_byte_length;
 		}
 
 		return $name_byte_length;

From 0f159b46c7af2dde854ecdef6868757e829c84f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 18:24:01 +0100
Subject: [PATCH 24/28] Pass int as invalid_length

---
 components/XML/class-xmlprocessor.php | 1 +
 1 file changed, 1 insertion(+)

diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php
index 1b3c248d..ffcca44c 100644
--- a/components/XML/class-xmlprocessor.php
+++ b/components/XML/class-xmlprocessor.php
@@ -2356,6 +2356,7 @@ private function parse_name( $offset ) {
 			 */
 			$at = $offset + $name_byte_length;
 			$new_at = $at;
+			$invalid_length = 0;
 			if ( 1 !== _wp_scan_utf8( $this->xml, $new_at, $invalid_length, null, 1 ) ) {
 				// EOF or invalid utf-8 byte sequence.
 				break;

From ec9187b4a24e1e98c47a185fe9d8114bb09287a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 18:28:56 +0100
Subject: [PATCH 25/28] Reorganize parse_name

---
 components/XML/class-xmlprocessor.php | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php
index ffcca44c..4ecbc21d 100644
--- a/components/XML/class-xmlprocessor.php
+++ b/components/XML/class-xmlprocessor.php
@@ -2329,8 +2329,8 @@ private function skip_whitespace() {
 	 * @return int
 	 */
 	private function parse_name( $offset ) {
-		static $i         = 0;
 		$name_byte_length = 0;
+		$at = $offset;
 		while ( true ) {
 			/**
 			 * Parse the next unicode codepoint.
@@ -2354,7 +2354,6 @@ private function parse_name( $offset ) {
 			 * is likely faster. It would be interesting to evaluate the performance
 			 * and prefer mbstring whenever it's available.
 			 */
-			$at = $offset + $name_byte_length;
 			$new_at = $at;
 			$invalid_length = 0;
 			if ( 1 !== _wp_scan_utf8( $this->xml, $new_at, $invalid_length, null, 1 ) ) {
@@ -2370,6 +2369,7 @@ private function parse_name( $offset ) {
 				break;
 			}
 			$name_byte_length += $codepoint_byte_length;
+			$at = $new_at;
 		}
 
 		return $name_byte_length;

From 2f5b37045da83319af83ed94485adfcf4c0f8626 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 19:12:24 +0100
Subject: [PATCH 26/28] Restore utf8_codepoint_at-based processing

---
 components/Encoding/compat-utf8.php   |  4 +--
 components/Encoding/utf8-decoder.php  |  6 ----
 components/XML/class-xmlprocessor.php | 40 +++++++++++++--------------
 3 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/components/Encoding/compat-utf8.php b/components/Encoding/compat-utf8.php
index d261c48d..a13e9e88 100644
--- a/components/Encoding/compat-utf8.php
+++ b/components/Encoding/compat-utf8.php
@@ -54,7 +54,7 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
 	$max_count         = $max_code_points ?? PHP_INT_MAX;
 	$has_noncharacters = false;
 
-	for ( $i = $at; $i < $end && $count < $max_count; $i++ ) {
+	for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
 		/*
 		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
 		 *
@@ -564,4 +564,4 @@ function _wp_utf8_decode_fallback( $utf8_text ) {
 
 	$iso_8859_1_text .= substr( $utf8_text, $was_at );
 	return $iso_8859_1_text;
-}
+}
\ No newline at end of file
diff --git a/components/Encoding/utf8-decoder.php b/components/Encoding/utf8-decoder.php
index bc041f8b..954274f9 100644
--- a/components/Encoding/utf8-decoder.php
+++ b/components/Encoding/utf8-decoder.php
@@ -34,12 +34,6 @@
  * @return int Unicode codepoint.
  */
 function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes = 0 ) {
-	if ( 1 !== _wp_scan_utf8( $text, $byte_offset, $matched_bytes, null, 1 ) ) {
-
-		// "\u{FFFD}" is not supported in PHP 5.6.
-		$codepoint = utf8_ord( "\u{FFFD}" );
-	}
-
 	$position_in_input = $byte_offset;
 	$codepoint_at      = $byte_offset;
 	$end_byte          = strlen( $text );
diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php
index 4ecbc21d..5c15f8eb 100644
--- a/components/XML/class-xmlprocessor.php
+++ b/components/XML/class-xmlprocessor.php
@@ -5,8 +5,7 @@
 use WP_HTML_Span;
 use WP_HTML_Text_Replacement;
 
-use function WordPress\Encoding\compat\_wp_scan_utf8;
-use function WordPress\Encoding\utf8_ord;
+use function WordPress\Encoding\utf8_codepoint_at;
 
 /**
  * XML API: XMLProcessor class
@@ -44,9 +43,9 @@
  *
  * @TODO: Support XML 1.1.
  *
- * @TODO: Evaluate the performance of _wp_scan_utf8() against using the mbstring
+ * @TODO: Evaluate the performance of utf8_codepoint_at() against using the mbstring
  *        extension. If mbstring is faster, then use it whenever it's available with
- *        _wp_scan_utf8() as a fallback.
+ *        utf8_codepoint_at() as a fallback.
  *
  * @package WordPress
  * @subpackage HTML-API
@@ -2329,8 +2328,8 @@ private function skip_whitespace() {
 	 * @return int
 	 */
 	private function parse_name( $offset ) {
+		static $i         = 0;
 		$name_byte_length = 0;
-		$at = $offset;
 		while ( true ) {
 			/**
 			 * Parse the next unicode codepoint.
@@ -2349,27 +2348,28 @@ private function parse_name( $offset ) {
 			 *                  how to reliably reproduce this failure mode in a
 			 *                  unit test.
 			 *
-			 * Performance-wise, character-by-character processing via _wp_scan_utf8
+			 * Performance-wise, character-by-character processing via utf8_codepoint_at
 			 * is still much faster than relying on preg_match(). The mbstring extension
 			 * is likely faster. It would be interesting to evaluate the performance
 			 * and prefer mbstring whenever it's available.
 			 */
-			$new_at = $at;
-			$invalid_length = 0;
-			if ( 1 !== _wp_scan_utf8( $this->xml, $new_at, $invalid_length, null, 1 ) ) {
-				// EOF or invalid utf-8 byte sequence.
-				break;
-			}
-
-			$codepoint_byte_length = $new_at - $at;
-			$codepoint = utf8_ord( substr( $this->xml, $at, $codepoint_byte_length ) );
-
-			// The codepoint is not a valid part of an XML NameChar or NameStartChar.
-			if ( ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) ) {
+			$codepoint = utf8_codepoint_at(
+				$this->xml,
+				$offset + $name_byte_length,
+				$bytes_parsed
+			);
+			if (
+				// Byte sequence is not a valid UTF-8 codepoint.
+				( 0xFFFD === $codepoint && 0 === $bytes_parsed ) ||
+				// No codepoint at the given offset.
+				null === $codepoint ||
+				// The codepoint is not a valid part of an XML NameChar or NameStartChar.
+				! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length )
+			) {
 				break;
 			}
-			$name_byte_length += $codepoint_byte_length;
-			$at = $new_at;
+			$codepoint         = null;
+			$name_byte_length += $bytes_parsed;
 		}
 
 		return $name_byte_length;

From e3275162ee6367817a18c822b30322d3c3b7a7b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 19:25:15 +0100
Subject: [PATCH 27/28] Remove unnecessary changes to CSSProcessor

---
 .../DataLiberation/URL/class-cssprocessor.php    | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/components/DataLiberation/URL/class-cssprocessor.php b/components/DataLiberation/URL/class-cssprocessor.php
index f5ccdf63..7658be5a 100644
--- a/components/DataLiberation/URL/class-cssprocessor.php
+++ b/components/DataLiberation/URL/class-cssprocessor.php
@@ -2,10 +2,10 @@
 
 namespace WordPress\DataLiberation\URL;
 
+use function WordPress\Encoding\utf8_codepoint_at;
+use function WordPress\Encoding\codepoint_to_utf8_bytes;
 use function WordPress\Encoding\compat\_wp_scan_utf8;
 use function WordPress\Encoding\wp_scrub_utf8;
-use function WordPress\Encoding\codepoint_to_utf8_bytes;
-use function WordPress\Encoding\utf8_ord;
 
 /**
  * Tokenizes CSS according to the CSS Syntax Level 3 specification.
@@ -1475,7 +1475,7 @@ private function consume_ident_codepoint( $at ): int {
 	 * @return int The number of bytes consumed.
 	 */
 	private function consume_ident_start_codepoint( $at ): int {
-		if ( $at >= $this->length ) {
+		if ( $at > $this->length ) {
 			return 0;
 		}
 
@@ -1500,16 +1500,14 @@ private function consume_ident_start_codepoint( $at ): int {
 			 *
 			 * We'll move forward by $invalid_length bytes and continue processing.
 			 * Later on, during the string decoding, we'll replace the invalid bytes with U+FFFD
-			 * via maximal subpart"replacement.
-			 *
-			 * Ensure we always return at least 1 byte to avoid infinite loops.
+			 * via "maximal subpart" replacement.
 			 */
-			return max( 1, $invalid_length );
+			return $invalid_length;
 		}
 
 		$codepoint_byte_length = $new_at - $at;
-		$codepoint = utf8_ord( substr( $this->css, $at, $codepoint_byte_length ) );
-		if ( $codepoint >= 0x80 ) {
+		$codepoint             = utf8_codepoint_at( $this->css, $at );
+		if ( null !== $codepoint && $codepoint >= 0x80 ) {
 			return $codepoint_byte_length;
 		}
 		return 0;

From 27bf87fd95b6e49629c54b4b82b49c74e8e7091c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= <adam@adamziel.com>
Date: Sat, 1 Nov 2025 19:28:33 +0100
Subject: [PATCH 28/28] PHPCS

---
 components/Encoding/compat-utf8.php  | 2 +-
 components/Encoding/utf8-decoder.php | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/components/Encoding/compat-utf8.php b/components/Encoding/compat-utf8.php
index a13e9e88..89dafb5c 100644
--- a/components/Encoding/compat-utf8.php
+++ b/components/Encoding/compat-utf8.php
@@ -564,4 +564,4 @@ function _wp_utf8_decode_fallback( $utf8_text ) {
 
 	$iso_8859_1_text .= substr( $utf8_text, $was_at );
 	return $iso_8859_1_text;
-}
\ No newline at end of file
+}
diff --git a/components/Encoding/utf8-decoder.php b/components/Encoding/utf8-decoder.php
index 954274f9..ddcc4acc 100644
--- a/components/Encoding/utf8-decoder.php
+++ b/components/Encoding/utf8-decoder.php
@@ -1,6 +1,7 @@
 <?php
 
 namespace WordPress\Encoding;
+
 use function WordPress\Encoding\compat\_wp_scan_utf8;
 
 /*
@@ -150,4 +151,4 @@ function utf8_decoder_apply_byte( string $byte, int $state, int &$codepoint = 0
 		: ( ( $byte & 0x3F ) | ( $codepoint << 6 ) );
 
 	return ord( $state_table[ 256 + ( $state * 16 ) + $type ] );
-}
\ No newline at end of file
+}