diff --git a/components/Blueprints/class-runner.php b/components/Blueprints/class-runner.php index 4d7ab908..5678c849 100644 --- a/components/Blueprints/class-runner.php +++ b/components/Blueprints/class-runner.php @@ -55,7 +55,7 @@ use WordPress\HttpClient\Client; use WordPress\Zip\ZipFilesystem; -use function WordPress\Encoding\_wp_has_noncharacters_fallback; +use function WordPress\Encoding\wp_is_valid_utf8; use function WordPress\Filesystem\wp_unix_sys_get_temp_dir; use function WordPress\Zip\is_zip_file_stream; @@ -379,14 +379,7 @@ private function load_blueprint() { // Validate the Blueprint string we've just loaded. // **UTF-8 Encoding:** Assert the Blueprint input is UTF-8 encoded. - $is_valid_utf8 = false; - if ( function_exists( 'mb_check_encoding' ) ) { - $is_valid_utf8 = mb_check_encoding( $blueprint_string, 'UTF-8' ); - } else { - $is_valid_utf8 = ! _wp_has_noncharacters_fallback( $blueprint_string ); - } - - if ( ! $is_valid_utf8 ) { + if ( ! wp_is_valid_utf8( $blueprint_string ) ) { throw new BlueprintExecutionException( 'Blueprint must be encoded as UTF-8.' ); } diff --git a/components/DataLiberation/URL/class-cssprocessor.php b/components/DataLiberation/URL/class-cssprocessor.php index a7a1afcb..7658be5a 100644 --- a/components/DataLiberation/URL/class-cssprocessor.php +++ b/components/DataLiberation/URL/class-cssprocessor.php @@ -2,10 +2,10 @@ namespace WordPress\DataLiberation\URL; -use function WordPress\Encoding\_wp_scan_utf8; -use function WordPress\Encoding\_wp_scrub_utf8_fallback; use function WordPress\Encoding\utf8_codepoint_at; use function WordPress\Encoding\codepoint_to_utf8_bytes; +use function WordPress\Encoding\compat\_wp_scan_utf8; +use function WordPress\Encoding\wp_scrub_utf8; /** * Tokenizes CSS according to the CSS Syntax Level 3 specification. @@ -1528,7 +1528,7 @@ private function consume_ident_start_codepoint( $at ): int { */ private function decode_string_or_url( int $start, int $length ): string { // Fast path: check if any processing is needed. - $slice = _wp_scrub_utf8_fallback( substr( $this->css, $start, $length ) ); + $slice = wp_scrub_utf8( substr( $this->css, $start, $length ) ); $special_chars = "\\\r\f\x00"; if ( false === strpbrk( $slice, $special_chars ) ) { // No special chars - return raw substring (almost zero allocations). diff --git a/components/Encoding/compat-utf8.php b/components/Encoding/compat-utf8.php new file mode 100644 index 00000000..89dafb5c --- /dev/null +++ b/components/Encoding/compat-utf8.php @@ -0,0 +1,567 @@ +?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f", + $i, + $end - $i + ); + + if ( $count + $ascii_byte_count >= $max_count ) { + $at = $i + ( $max_count - $count ); + $count = $max_count; + return $count; + } + + $count += $ascii_byte_count; + $i += $ascii_byte_count; + + if ( $i >= $end ) { + $at = $end; + return $count; + } + + /** + * The above fast-track handled all single-byte UTF-8 characters. What + * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8. + * + * Therefore everything past here is checking those multibyte sequences. + * + * It may look like there’s a need to check against the max bytes here, + * but since each match of a single character returns, this functions will + * bail already if crossing the max-bytes threshold. This function SHALL + * NOT return in the middle of a multi-byte character, so if a character + * falls on each side of the max bytes, the entire character will be scanned. + * + * Because it’s possible that there are truncated characters, the use of + * the null-coalescing operator with "\xC0" is a convenience for skipping + * length checks on every continuation bytes. This works because 0xC0 is + * always invalid in a UTF-8 string, meaning that if the string has been + * truncated, it will find 0xC0 and reject as invalid UTF-8. + * + * > [The following table] lists all of the byte sequences that are well-formed + * > in UTF-8. A range of byte values such as A0..BF indicates that any byte + * > from A0 to BF (inclusive) is well-formed in that position. Any byte value + * > outside of the ranges listed is ill-formed. + * + * > Table 3-7. Well-Formed UTF-8 Byte Sequences + * ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮ + * │ Code Points │ First Byte │ Second Byte │ Third Byte │ Fourth Byte │ + * ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤ + * │ U+0000..U+007F │ 00..7F │ │ │ │ + * │ U+0080..U+07FF │ C2..DF │ 80..BF │ │ │ + * │ U+0800..U+0FFF │ E0 │ A0..BF │ 80..BF │ │ + * │ U+1000..U+CFFF │ E1..EC │ 80..BF │ 80..BF │ │ + * │ U+D000..U+D7FF │ ED │ 80..9F │ 80..BF │ │ + * │ U+E000..U+FFFF │ EE..EF │ 80..BF │ 80..BF │ │ + * │ U+10000..U+3FFFF │ F0 │ 90..BF │ 80..BF │ 80..BF │ + * │ U+40000..U+FFFFF │ F1..F3 │ 80..BF │ 80..BF │ 80..BF │ + * │ U+100000..U+10FFFF │ F4 │ 80..8F │ 80..BF │ 80..BF │ + * ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯ + * + * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 + */ + + // Valid two-byte code points. + $b1 = ord( $bytes[ $i ] ); + $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" ); + + if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) { + ++$count; + ++$i; + continue; + } + + // Valid three-byte code points. + $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" ); + + if ( $b3 < 0x80 || $b3 > 0xBF ) { + goto invalid_utf8; + } + + if ( + ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) || + ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) || + ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) || + ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF ) + ) { + ++$count; + $i += 2; + + // Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF. + if ( 0xEF === $b1 ) { + $has_noncharacters |= ( + ( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) || + ( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) ) + ); + } + + continue; + } + + // Valid four-byte code points. + $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" ); + + if ( $b4 < 0x80 || $b4 > 0xBF ) { + goto invalid_utf8; + } + + if ( + ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) || + ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) || + ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F ) + ) { + ++$count; + $i += 3; + + // Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF. + $has_noncharacters |= ( + ( 0x0F === ( $b2 & 0x0F ) ) && + 0xBF === $b3 && + ( 0xBE === $b4 || 0xBF === $b4 ) + ); + + continue; + } + + /** + * When encountering invalid byte sequences, Unicode suggests finding the + * maximal subpart of a text and replacing that subpart with a single + * replacement character. + * + * > This practice is more secure because it does not result in the + * > conversion consuming parts of valid sequences as though they were + * > invalid. It also guarantees at least one replacement character will + * > occur for each instance of an invalid sequence in the original text. + * > Furthermore, this practice can be defined consistently for better + * > interoperability between different implementations of conversion. + * + * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630 + */ + invalid_utf8: + $at = $i; + $invalid_length = 1; + + // Single-byte and two-byte characters. + if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) { + return $count; + } + + $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" ); + $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" ); + + // Find the maximal subpart and skip past it. + if ( 0xE0 === ( $b1 & 0xF0 ) ) { + // Three-byte characters. + $b2_valid = ( + ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) || + ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) || + ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) || + ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF ) + ); + + $invalid_length = min( $end - $i, $b2_valid ? 2 : 1 ); + return $count; + } elseif ( 0xF0 === ( $b1 & 0xF8 ) ) { + // Four-byte characters. + $b2_valid = ( + ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) || + ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) || + ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F ) + ); + + $b3_valid = $b3 >= 0x80 && $b3 <= 0xBF; + + $invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 ); + return $count; + } + + return $count; + } + + $at = $i; + return $count; +} + +/** + * Fallback mechanism for safely validating UTF-8 bytes. + * + * @since 6.9.0 + * @access private + * + * @see wp_is_valid_utf8() + * + * @param string $bytes String which might contain text encoded as UTF-8. + * @return bool Whether the provided bytes can decode as valid UTF-8. + */ +function _wp_is_valid_utf8_fallback( string $bytes ): bool { + $bytes_length = strlen( $bytes ); + if ( 0 === $bytes_length ) { + return true; + } + + $next_byte_at = 0; + $invalid_length = 0; + + _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length ); + + return $bytes_length === $next_byte_at && 0 === $invalid_length; +} + +/** + * Fallback mechanism for replacing invalid spans of UTF-8 bytes. + * + * Example: + * + * 'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252. + * + * @since 6.9.0 + * @access private + * + * @see wp_scrub_utf8() + * + * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes. + * @return string Input string with spans of invalid bytes swapped with the replacement character. + */ +function _wp_scrub_utf8_fallback( string $bytes ): string { + $bytes_length = strlen( $bytes ); + $next_byte_at = 0; + $was_at = 0; + $invalid_length = 0; + $scrubbed = ''; + + while ( $next_byte_at <= $bytes_length ) { + _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length ); + + if ( $next_byte_at >= $bytes_length ) { + if ( 0 === $was_at ) { + return $bytes; + } + + return $scrubbed . substr( $bytes, $was_at, $next_byte_at - $was_at - $invalid_length ); + } + + $scrubbed .= substr( $bytes, $was_at, $next_byte_at - $was_at ); + $scrubbed .= "\u{FFFD}"; + + $next_byte_at += $invalid_length; + $was_at = $next_byte_at; + } + + return $scrubbed; +} + +/** + * Returns how many code points are found in the given UTF-8 string. + * + * Invalid spans of bytes count as a single code point according + * to the maximal subpart rule. This function is a fallback method + * for calling `mb_strlen( $text, 'UTF-8' )`. + * + * When negative values are provided for the byte offsets or length, + * this will always report zero code points. + * + * Example: + * + * 4 === _wp_utf8_codepoint_count( 'text' ); + * + * // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'. + * 13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" ); + * + * @since 6.9.0 + * @access private + * + * @param string $text Count code points in this string. + * @param ?int $byte_offset Start counting after this many bytes in `$text`. Must be positive. + * @param ?int $max_byte_length Optional. Stop counting after having scanned past this many bytes. + * Default is to scan until the end of the string. Must be positive. + * @return int How many code points were found. + */ +function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int { + if ( $byte_offset < 0 ) { + return 0; + } + + $count = 0; + $at = $byte_offset; + $end = strlen( $text ); + $invalid_length = 0; + $max_byte_length = min( $end - $at, $max_byte_length ); + + while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) { + $count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) ); + $count += $invalid_length > 0 ? 1 : 0; + $at += $invalid_length; + } + + return $count; +} + +/** + * Given a starting offset within a string and a maximum number of code points, + * return how many bytes are occupied by the span of characters. + * + * Invalid spans of bytes count as a single code point according to the maximal + * subpart rule. This function is a fallback method for calling + * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`. + * + * @since 6.9.0 + * @access private + * + * @param string $text Count bytes of span in this text. + * @param int $byte_offset Start counting at this byte offset. + * @param int $max_code_points Stop counting after this many code points have been seen, + * or at the end of the string. + * @param ?int $found_code_points Optional. Will be set to number of found code points in + * span, as this might be smaller than the maximum count if + * the string is not long enough. + * @return int Number of bytes spanned by the code points. + */ +function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int { + $was_at = $byte_offset; + $invalid_length = 0; + $end = strlen( $text ); + $found_code_points = 0; + + while ( $byte_offset < $end && $found_code_points < $max_code_points ) { + $needed = $max_code_points - $found_code_points; + $chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed ); + + $found_code_points += $chunk_count; + + // Invalid spans only convey one code point count regardless of how long they are. + if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) { + ++$found_code_points; + $byte_offset += $invalid_length; + } + } + + return $byte_offset - $was_at; +} + +/** + * Fallback support for determining if a string contains Unicode noncharacters. + * + * @since 6.9.0 + * @access private + * + * @see \wp_has_noncharacters() + * + * @param string $text Are there noncharacters in this string? + * @return bool Whether noncharacters were found in the string. + */ +function _wp_has_noncharacters_fallback( string $text ): bool { + $at = 0; + $invalid_length = 0; + $has_noncharacters = false; + $end = strlen( $text ); + + while ( $at < $end && ! $has_noncharacters ) { + _wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters ); + $at += $invalid_length; + } + + return $has_noncharacters; +} + +/** + * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility + * with the deprecated function from the PHP standard library. + * + * @since 6.9.0 + * @access private + * + * @see \utf8_encode() + * + * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes. + * @return string Text converted into UTF-8. + */ +function _wp_utf8_encode_fallback( $iso_8859_1_text ) { + $iso_8859_1_text = (string) $iso_8859_1_text; + $at = 0; + $was_at = 0; + $end = strlen( $iso_8859_1_text ); + $utf8 = ''; + + while ( $at < $end ) { + // US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F. + $ascii_byte_count = strspn( + $iso_8859_1_text, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . + " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f", + $at + ); + + if ( $ascii_byte_count > 0 ) { + $at += $ascii_byte_count; + continue; + } + + // All other bytes transform into two-byte UTF-8 sequences. + $code_point = ord( $iso_8859_1_text[ $at ] ); + $byte1 = chr( 0xC0 | ( $code_point >> 6 ) ); + $byte2 = chr( 0x80 | ( $code_point & 0x3F ) ); + + $utf8 .= substr( $iso_8859_1_text, $was_at, $at - $was_at ); + $utf8 .= "{$byte1}{$byte2}"; + + ++$at; + $was_at = $at; + } + + if ( 0 === $was_at ) { + return $iso_8859_1_text; + } + + $utf8 .= substr( $iso_8859_1_text, $was_at ); + return $utf8; +} + +/** + * Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility + * with the deprecated function from the PHP standard library. + * + * @since 6.9.0 + * @access private + * + * @see \utf8_decode() + * + * @param string $utf8_text Text treated as UTF-8 bytes. + * @return string Text converted into ISO-8859-1. + */ +function _wp_utf8_decode_fallback( $utf8_text ) { + $utf8_text = (string) $utf8_text; + $at = 0; + $was_at = 0; + $end = strlen( $utf8_text ); + $iso_8859_1_text = ''; + + while ( $at < $end ) { + // US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F. + $ascii_byte_count = strspn( + $utf8_text, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . + " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f", + $at + ); + + if ( $ascii_byte_count > 0 ) { + $at += $ascii_byte_count; + continue; + } + + $next_at = $at; + $invalid_length = 0; + $found = _wp_scan_utf8( $utf8_text, $next_at, $invalid_length, null, 1 ); + $span_length = $next_at - $at; + $next_byte = '?'; + + if ( 1 !== $found ) { + if ( $invalid_length > 0 ) { + $next_byte = ''; + goto flush_sub_part; + } + + break; + } + + // All convertible code points are two-bytes long. + $byte1 = ord( $utf8_text[ $at ] ); + if ( 0xC0 !== ( $byte1 & 0xE0 ) ) { + goto flush_sub_part; + } + + // All convertible code points are not greater than U+FF. + $byte2 = ord( $utf8_text[ $at + 1 ] ); + $code_point = ( ( $byte1 & 0x1F ) << 6 ) | ( ( $byte2 & 0x3F ) ); + if ( $code_point > 0xFF ) { + goto flush_sub_part; + } + + $next_byte = chr( $code_point ); + + flush_sub_part: + $iso_8859_1_text .= substr( $utf8_text, $was_at, $at - $was_at ); + $iso_8859_1_text .= $next_byte; + $at += $span_length; + $was_at = $at; + + if ( $invalid_length > 0 ) { + $iso_8859_1_text .= '?'; + $at += $invalid_length; + $was_at = $at; + } + } + + if ( 0 === $was_at ) { + return $utf8_text; + } + + $iso_8859_1_text .= substr( $utf8_text, $was_at ); + return $iso_8859_1_text; +} diff --git a/components/Encoding/composer.json b/components/Encoding/composer.json index 1bf6c1da..187b7559 100644 --- a/components/Encoding/composer.json +++ b/components/Encoding/composer.json @@ -1,35 +1,37 @@ { - "name": "wp-php-toolkit/encoding", - "description": "Encoding component for WordPress.", - "type": "library", - "license": "GPL-2.0-or-later", - "authors": [ - { - "name": "Adam Zielinski", - "email": "adam@adamziel.com" - }, - { - "name": "WordPress Team", - "email": "wordpress@wordpress.org" - } - ], - "require": { - "php": ">=7.2" - }, - "autoload": { - "files": [ - "utf8-decoder.php", - "utf8-encoder.php" - ], - "exclude-from-classmap": [ - "/Tests/" - ] - }, - "archive": { - "exclude": [ - "**/.github/", - "**/Tests/", - "**/bin/" - ] - } + "name": "wp-php-toolkit/encoding", + "description": "Encoding component for WordPress.", + "type": "library", + "license": "GPL-2.0-or-later", + "authors": [ + { + "name": "Adam Zielinski", + "email": "adam@adamziel.com" + }, + { + "name": "WordPress Team", + "email": "wordpress@wordpress.org" + } + ], + "require": { + "php": ">=7.2" + }, + "autoload": { + "files": [ + "utf8.php", + "compat-utf8.php", + "utf8-encoder.php", + "utf8-decoder.php" + ], + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "archive": { + "exclude": [ + "**/.github/", + "**/Tests/", + "**/bin/" + ] + } } diff --git a/components/Encoding/utf8-decoder.php b/components/Encoding/utf8-decoder.php index e339e710..ddcc4acc 100644 --- a/components/Encoding/utf8-decoder.php +++ b/components/Encoding/utf8-decoder.php @@ -2,6 +2,8 @@ namespace WordPress\Encoding; +use function WordPress\Encoding\compat\_wp_scan_utf8; + /* * UTF-8 decoding pipeline by Dennis Snell (@dmsnell), originally * proposed in https://github.com/WordPress/wordpress-develop/pull/6883. @@ -19,655 +21,76 @@ } /** - * Finds spans of valid and invalid UTF-8 bytes in a given string. - * - * This is a low-level tool to power various UTF-8 functionality. - * It scans through a string until it finds invalid byte spans. - * When it does this, it does three things: - * - * - Assigns `$at` to the position after the last successful code point. - * - Assigns `$invalid_length` to the length of the maximal subpart of - * the invalid bytes starting at `$at`. - * - Returns how many code points were successfully scanned. - * - * This information is enough to build a number of useful UTF-8 functions. - * - * Example: - * - * // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1. - * "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' ); - * $at = $invalid_length = 0; - * - * // The first step finds the invalid 0xF1 byte. - * 2 === _wp_scan_utf8( $pineapple, $at, $invalid_length ); - * $at === 2; $invalid_length === 1; - * - * // The second step continues to the end of the string. - * 1 === _wp_scan_utf8( $pineapple, $at, $invalid_length ); - * $at === 4; $invalid_length === 0; - * - * Note! While passing an options array here might be convenient from a calling-code standpoint, - * this function is intended to serve as a very low-level foundation upon which to build - * higher level functionality. For the sake of keeping costs explicit all arguments are - * passed directly. - * - * @since 6.9.0 - * @access private - * - * @param string $bytes UTF-8 encoded string which might include invalid spans of bytes. - * @param int $at Where to start scanning. - * @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`. - * @param int|null $max_bytes Stop scanning after this many bytes have been seen. - * @param int|null $max_code_points Stop scanning after this many code points have been seen. - * @param bool|null $has_noncharacters Set to indicate if scanned string contained noncharacters. - * @return int How many code points were successfully scanned. - */ -function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int { - $byte_length = strlen( $bytes ); - $end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) ); - $invalid_length = 0; - $count = 0; - $max_count = $max_code_points ?? PHP_INT_MAX; - $has_noncharacters = false; - - for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) { - /* - * Quickly skip past US-ASCII bytes, all of which are valid UTF-8. - * - * This optimization step improves the speed from 10x to 100x - * depending on whether the JIT has optimized the function. - */ - $ascii_byte_count = strspn( - $bytes, - "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . - "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . - " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f", - $i, - $end - $i - ); - - if ( $count + $ascii_byte_count >= $max_count ) { - $at = $i + ( $max_count - $count ); - $count = $max_count; - return $count; - } - - $count += $ascii_byte_count; - $i += $ascii_byte_count; - - if ( $i >= $end ) { - $at = $end; - return $count; - } - - /** - * The above fast-track handled all single-byte UTF-8 characters. What - * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8. - * - * Therefore everything past here is checking those multibyte sequences. - * - * It may look like there’s a need to check against the max bytes here, - * but since each match of a single character returns, this functions will - * bail already if crossing the max-bytes threshold. This function SHALL - * NOT return in the middle of a multi-byte character, so if a character - * falls on each side of the max bytes, the entire character will be scanned. - * - * Because it’s possible that there are truncated characters, the use of - * the null-coalescing operator with "\xC0" is a convenience for skipping - * length checks on every continuation bytes. This works because 0xC0 is - * always invalid in a UTF-8 string, meaning that if the string has been - * truncated, it will find 0xC0 and reject as invalid UTF-8. - * - * > [The following table] lists all of the byte sequences that are well-formed - * > in UTF-8. A range of byte values such as A0..BF indicates that any byte - * > from A0 to BF (inclusive) is well-formed in that position. Any byte value - * > outside of the ranges listed is ill-formed. - * - * > Table 3-7. Well-Formed UTF-8 Byte Sequences - * ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮ - * │ Code Points │ First Byte │ Second Byte │ Third Byte │ Fourth Byte │ - * ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤ - * │ U+0000..U+007F │ 00..7F │ │ │ │ - * │ U+0080..U+07FF │ C2..DF │ 80..BF │ │ │ - * │ U+0800..U+0FFF │ E0 │ A0..BF │ 80..BF │ │ - * │ U+1000..U+CFFF │ E1..EC │ 80..BF │ 80..BF │ │ - * │ U+D000..U+D7FF │ ED │ 80..9F │ 80..BF │ │ - * │ U+E000..U+FFFF │ EE..EF │ 80..BF │ 80..BF │ │ - * │ U+10000..U+3FFFF │ F0 │ 90..BF │ 80..BF │ 80..BF │ - * │ U+40000..U+FFFFF │ F1..F3 │ 80..BF │ 80..BF │ 80..BF │ - * │ U+100000..U+10FFFF │ F4 │ 80..8F │ 80..BF │ 80..BF │ - * ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯ - * - * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 - */ - - // Valid two-byte code points. - $b1 = ord( $bytes[ $i ] ); - $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" ); - - if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) { - ++$count; - ++$i; - continue; - } - - // Valid three-byte code points. - $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" ); - - if ( $b3 < 0x80 || $b3 > 0xBF ) { - goto invalid_utf8; - } - - if ( - ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) || - ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) || - ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) || - ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF ) - ) { - ++$count; - $i += 2; - - // Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF. - if ( 0xEF === $b1 ) { - $has_noncharacters |= ( - ( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) || - ( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) ) - ); - } - - continue; - } - - // Valid four-byte code points. - $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" ); - - if ( $b4 < 0x80 || $b4 > 0xBF ) { - goto invalid_utf8; - } - - if ( - ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) || - ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) || - ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F ) - ) { - ++$count; - $i += 3; - - // Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF. - $has_noncharacters |= ( - ( 0x0F === ( $b2 & 0x0F ) ) && - 0xBF === $b3 && - ( 0xBE === $b4 || 0xBF === $b4 ) - ); - - continue; - } - - /** - * When encountering invalid byte sequences, Unicode suggests finding the - * maximal subpart of a text and replacing that subpart with a single - * replacement character. - * - * > This practice is more secure because it does not result in the - * > conversion consuming parts of valid sequences as though they were - * > invalid. It also guarantees at least one replacement character will - * > occur for each instance of an invalid sequence in the original text. - * > Furthermore, this practice can be defined consistently for better - * > interoperability between different implementations of conversion. - * - * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630 - */ - invalid_utf8: - $at = $i; - $invalid_length = 1; - - // Single-byte and two-byte characters. - if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) { - return $count; - } - - $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" ); - $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" ); - - // Find the maximal subpart and skip past it. - if ( 0xE0 === ( $b1 & 0xF0 ) ) { - // Three-byte characters. - $b2_valid = ( - ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) || - ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) || - ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) || - ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF ) - ); - - $invalid_length = min( $end - $i, $b2_valid ? 2 : 1 ); - return $count; - } elseif ( 0xF0 === ( $b1 & 0xF8 ) ) { - // Four-byte characters. - $b2_valid = ( - ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) || - ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) || - ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F ) - ); - - $b3_valid = $b3 >= 0x80 && $b3 <= 0xBF; - - $invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 ); - return $count; - } - - return $count; - } - - $at = $i; - return $count; -} - -/** - * Fallback mechanism for safely validating UTF-8 bytes. - * - * @since 6.9.0 - * @access private - * - * @see wp_is_valid_utf8() - * - * @param string $bytes String which might contain text encoded as UTF-8. - * @return bool Whether the provided bytes can decode as valid UTF-8. - */ -function _wp_is_valid_utf8_fallback( string $bytes ): bool { - $bytes_length = strlen( $bytes ); - if ( 0 === $bytes_length ) { - return true; - } - - $next_byte_at = 0; - $invalid_length = 0; - - _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length ); - - return $bytes_length === $next_byte_at && 0 === $invalid_length; -} - -/** - * Fallback mechanism for replacing invalid spans of UTF-8 bytes. - * - * Example: - * - * 'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252. - * - * @since 6.9.0 - * @access private - * - * @see wp_scrub_utf8() - * - * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes. - * @return string Input string with spans of invalid bytes swapped with the replacement character. - */ -function _wp_scrub_utf8_fallback( string $bytes ): string { - $bytes_length = strlen( $bytes ); - $next_byte_at = 0; - $was_at = 0; - $invalid_length = 0; - $scrubbed = ''; - - while ( $next_byte_at <= $bytes_length ) { - _wp_scan_utf8( $bytes, $next_byte_at, $invalid_length ); - - if ( $next_byte_at >= $bytes_length ) { - if ( 0 === $was_at ) { - return $bytes; - } - - return $scrubbed . substr( $bytes, $was_at, $next_byte_at - $was_at - $invalid_length ); - } - - $scrubbed .= substr( $bytes, $was_at, $next_byte_at - $was_at ); - $scrubbed .= "\u{FFFD}"; - - $next_byte_at += $invalid_length; - $was_at = $next_byte_at; - } - - return $scrubbed; -} - -/** - * Returns how many code points are found in the given UTF-8 string. - * - * Invalid spans of bytes count as a single code point according - * to the maximal subpart rule. This function is a fallback method - * for calling `mb_strlen( $text, 'UTF-8' )`. - * - * When negative values are provided for the byte offsets or length, - * this will always report zero code points. - * - * Example: - * - * 4 === _wp_utf8_codepoint_count( 'text' ); - * - * // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'. - * 13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" ); - * - * @since 6.9.0 - * @access private - * - * @param string $text Count code points in this string. - * @param ?int $byte_offset Start counting after this many bytes in `$text`. Must be positive. - * @param ?int $max_byte_length Optional. Stop counting after having scanned past this many bytes. - * Default is to scan until the end of the string. Must be positive. - * @return int How many code points were found. - */ -function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int { - if ( $byte_offset < 0 ) { - return 0; - } - - $count = 0; - $at = $byte_offset; - $end = strlen( $text ); - $invalid_length = 0; - $max_byte_length = min( $end - $at, $max_byte_length ); - - while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) { - $count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) ); - $count += $invalid_length > 0 ? 1 : 0; - $at += $invalid_length; - } - - return $count; -} - -/** - * Given a starting offset within a string and a maximum number of code points, - * return how many bytes are occupied by the span of characters. - * - * Invalid spans of bytes count as a single code point according to the maximal - * subpart rule. This function is a fallback method for calling - * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`. - * - * @since 6.9.0 - * @access private - * - * @param string $text Count bytes of span in this text. - * @param int $byte_offset Start counting at this byte offset. - * @param int $max_code_points Stop counting after this many code points have been seen, - * or at the end of the string. - * @param ?int $found_code_points Optional. Will be set to number of found code points in - * span, as this might be smaller than the maximum count if - * the string is not long enough. - * @return int Number of bytes spanned by the code points. - */ -function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int { - $was_at = $byte_offset; - $invalid_length = 0; - $end = strlen( $text ); - $found_code_points = 0; - - while ( $byte_offset < $end && $found_code_points < $max_code_points ) { - $needed = $max_code_points - $found_code_points; - $chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed ); - - $found_code_points += $chunk_count; - - // Invalid spans only convey one code point count regardless of how long they are. - if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) { - ++$found_code_points; - $byte_offset += $invalid_length; - } - } - - return $byte_offset - $was_at; -} - -/** - * Fallback support for determining if a string contains Unicode noncharacters. - * - * @since 6.9.0 - * @access private - * - * @see \wp_has_noncharacters() - * - * @param string $text Are there noncharacters in this string? - * @return bool Whether noncharacters were found in the string. - */ -function _wp_has_noncharacters_fallback( string $text ): bool { - $at = 0; - $invalid_length = 0; - $has_noncharacters = false; - $end = strlen( $text ); - - while ( $at < $end && ! $has_noncharacters ) { - _wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters ); - $at += $invalid_length; - } - - return $has_noncharacters; -} - -/** - * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility - * with the deprecated function from the PHP standard library. - * - * @since 6.9.0 - * @access private - * - * @see \utf8_encode() - * - * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes. - * @return string Text converted into UTF-8. - */ -function _wp_utf8_encode_fallback( $iso_8859_1_text ) { - $iso_8859_1_text = (string) $iso_8859_1_text; - $at = 0; - $was_at = 0; - $end = strlen( $iso_8859_1_text ); - $utf8 = ''; - - while ( $at < $end ) { - // US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F. - $ascii_byte_count = strspn( - $iso_8859_1_text, - "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . - "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . - " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f", - $at - ); - - if ( $ascii_byte_count > 0 ) { - $at += $ascii_byte_count; - continue; - } - - // All other bytes transform into two-byte UTF-8 sequences. - $code_point = ord( $iso_8859_1_text[ $at ] ); - $byte1 = chr( 0xC0 | ( $code_point >> 6 ) ); - $byte2 = chr( 0x80 | ( $code_point & 0x3F ) ); - - $utf8 .= substr( $iso_8859_1_text, $was_at, $at - $was_at ); - $utf8 .= "{$byte1}{$byte2}"; - - ++$at; - $was_at = $at; - } - - if ( 0 === $was_at ) { - return $iso_8859_1_text; - } - - $utf8 .= substr( $iso_8859_1_text, $was_at ); - return $utf8; -} - -/** - * Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility - * with the deprecated function from the PHP standard library. + * Extract a unicode codepoint from a specific offset in text. + * Invalid byte sequences count as a single code point, U+FFFD + * (the Unicode replacement character ``). * - * @since 6.9.0 - * @access private + * This function does not permit passing negative indices and will return + * null if such are provided. * - * @see utf8_decode() + * @param string $text Input text from which to extract. + * @param int $byte_offset Start at this byte offset in the input text. + * @param int $matched_bytes How many bytes were matched to produce the codepoint. * - * @param string $utf8_text Text treated as UTF-8 bytes. - * @return string Text converted into ISO-8859-1. + * @return int Unicode codepoint. */ -function _wp_utf8_decode_fallback( $utf8_text ) { - $utf8_text = (string) $utf8_text; - $at = 0; - $was_at = 0; - $end = strlen( $utf8_text ); - $iso_8859_1_text = ''; - - while ( $at < $end ) { - // US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F. - $ascii_byte_count = strspn( - $utf8_text, - "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . - "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . - " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f", - $at - ); - - if ( $ascii_byte_count > 0 ) { - $at += $ascii_byte_count; - continue; - } - - $next_at = $at; - $invalid_length = 0; - $found = _wp_scan_utf8( $utf8_text, $next_at, $invalid_length, null, 1 ); - $span_length = $next_at - $at; - $next_byte = '?'; +function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes = 0 ) { + $position_in_input = $byte_offset; + $codepoint_at = $byte_offset; + $end_byte = strlen( $text ); + $codepoint = null; + $decoder_state = UTF8_DECODER_ACCEPT; - if ( 1 !== $found ) { - if ( $invalid_length > 0 ) { - $next_byte = ''; - goto flush_sub_part; - } + // Get to the start of the string. + while ( $position_in_input < $end_byte ) { + $decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state ); + if ( UTF8_DECODER_ACCEPT === $decoder_state ) { + ++$position_in_input; + $codepoint = utf8_ord( substr( $text, $codepoint_at, $position_in_input - $codepoint_at ) ); break; - } - - // All convertible code points are two-bytes long. - $byte1 = ord( $utf8_text[ $at ] ); - if ( 0xC0 !== ( $byte1 & 0xE0 ) ) { - goto flush_sub_part; - } - - // All convertible code points are not greater than U+FF. - $byte2 = ord( $utf8_text[ $at + 1 ] ); - $code_point = ( ( $byte1 & 0x1F ) << 6 ) | ( ( $byte2 & 0x3F ) ); - if ( $code_point > 0xFF ) { - goto flush_sub_part; - } - - $next_byte = chr( $code_point ); - - flush_sub_part: - $iso_8859_1_text .= substr( $utf8_text, $was_at, $at - $was_at ); - $iso_8859_1_text .= $next_byte; - $at += $span_length; - $was_at = $at; - - if ( $invalid_length > 0 ) { - $iso_8859_1_text .= '?'; - $at += $invalid_length; - $was_at = $at; + } elseif ( UTF8_DECODER_REJECT === $decoder_state ) { + // "\u{FFFD}" is not supported in PHP 5.6. + $codepoint = utf8_ord( "\xEF\xBF\xBD" ); + break; + } else { + ++$position_in_input; } } - if ( 0 === $was_at ) { - return $utf8_text; - } + $matched_bytes = $position_in_input - $byte_offset; - $iso_8859_1_text .= substr( $utf8_text, $was_at ); - return $iso_8859_1_text; + return $codepoint; } /** - * Indicates if a given byte stream represents valid UTF-8. - * - * Note that unpaired surrogate halves are not valid UTF-8 and will be rejected. - * - * Example: - * - * true === utf8_is_valid_byte_stream( 'Hello, World! 🌎' ); - * - * false === utf8_is_valid_byte_stream( "Latin1 is n\xF6t valid UTF-8.", 0, $error_at ); - * 12 === $error_at; - * - * false === utf8_is_valid_byte_stream( "Surrogate halves like '\xDE\xFF\x80' are not permitted.", 0, $error_at ); - * 23 === $error_at; - * - * false === utf8_is_valid_byte_stream( "Broken stream: \xC2\xC2", 0, $error_at ); - * 15 === $error_at; + * Convert a UTF-8 byte sequence to its Unicode codepoint. * - * @param string $bytes Text to validate as UTF-8 bytes. - * @param int $starting_byte Byte offset in string where decoding should begin. - * @param int|null $first_error_byte_at Optional. If provided and byte stream fails to validate, - * will be set to the byte offset where the first invalid - * byte appeared. Otherwise, will not be set. + * @param string $character UTF-8 encoded byte sequence representing a single Unicode character. * - * @return bool Whether the given byte stream represents valid UTF-8. - * @since {WP_VERSION} + * @return int Unicode codepoint. */ -function utf8_is_valid_byte_stream( string $bytes, int $starting_byte = 0, ?int &$first_error_byte_at = null ): bool { - $state = UTF8_DECODER_ACCEPT; - $last_start_at = $starting_byte; +function utf8_ord( string $character ): int { + // Convert the byte sequence to its binary representation. + $bytes = unpack( 'C*', $character ); - for ( $at = $starting_byte, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) { - if ( UTF8_DECODER_ACCEPT === $state ) { - $last_start_at = $at; - } + // Initialize the codepoint. + $codepoint = 0; - $state = utf8_decoder_apply_byte( $bytes[ $at ], $state ); + // Calculate the codepoint based on the number of bytes. + if ( 1 === count( $bytes ) ) { + $codepoint = $bytes[1]; + } elseif ( 2 === count( $bytes ) ) { + $codepoint = ( ( $bytes[1] & 0x1F ) << 6 ) | ( $bytes[2] & 0x3F ); + } elseif ( 3 === count( $bytes ) ) { + $codepoint = ( ( $bytes[1] & 0x0F ) << 12 ) | ( ( $bytes[2] & 0x3F ) << 6 ) | ( $bytes[3] & 0x3F ); + } elseif ( 4 === count( $bytes ) ) { + $codepoint = ( ( $bytes[1] & 0x07 ) << 18 ) | ( ( $bytes[2] & 0x3F ) << 12 ) | ( ( $bytes[3] & 0x3F ) << 6 ) | ( $bytes[4] & 0x3F ); } - if ( UTF8_DECODER_ACCEPT === $state ) { - return true; - } else { - $first_error_byte_at = $last_start_at; - - return false; - } + return $codepoint; } -/** - * Returns number of code points found within a UTF-8 string, similar to `strlen()`. - * - * If the byte stream fails to properly decode as UTF-8 this function will set the - * byte index of the first error byte and report the number of decoded code points. - * - * @param string $bytes Text for which to count code points. - * @param int|null $first_error_byte_at Optional. If provided, will be set upon finding - * the first invalid byte. - * - * @return int How many code points were decoded in the given byte stream before an error - * or before reaching the end of the string. - * @since {WP_VERSION} - */ -function utf8_codepoint_count( string $bytes, ?int &$first_error_byte_at = null ): int { - $state = UTF8_DECODER_ACCEPT; - $last_start_at = 0; - $count = 0; - $codepoint = 0; - - for ( $at = 0, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) { - if ( UTF8_DECODER_ACCEPT === $state ) { - $last_start_at = $at; - } - - $state = utf8_decoder_apply_byte( $bytes[ $at ], $state, $codepoint ); - - if ( UTF8_DECODER_ACCEPT === $state ) { - ++$count; - } - } - - if ( UTF8_DECODER_ACCEPT !== $state ) { - $first_error_byte_at = $last_start_at; - } - - return $count; -} /** * Inner loop for a number of UTF-8 decoding-related functions. @@ -729,138 +152,3 @@ function utf8_decoder_apply_byte( string $byte, int $state, int &$codepoint = 0 return ord( $state_table[ 256 + ( $state * 16 ) + $type ] ); } - -/** - * Extract a slice of a text by code point, where invalid byte sequences count - * as a single code point, U+FFFD (the Unicode replacement character `�`). - * - * This function does not permit passing negative indices and will return - * the original string if such are provide. - * - * @param string $text Input text from which to extract. - * @param int $from Start extracting after this many code-points. - * @param int $length Extract this many code points. - * - * @return string Extracted slice of input string. - */ -function utf8_substr( string $text, int $from = 0, ?int $length = null ): string { - if ( $from < 0 || ( isset( $length ) && $length < 0 ) ) { - return $text; - } - - $position_in_input = 0; - $codepoint_at = 0; - $end_byte = strlen( $text ); - $buffer = ''; - $seen_codepoints = 0; - $sliced_codepoints = 0; - $decoder_state = UTF8_DECODER_ACCEPT; - - // Get to the start of the string. - while ( $position_in_input < $end_byte && $seen_codepoints < $length ) { - $decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state ); - - if ( UTF8_DECODER_ACCEPT === $decoder_state ) { - ++$position_in_input; - - if ( $seen_codepoints >= $from ) { - ++$sliced_codepoints; - $buffer .= substr( $text, $codepoint_at, $position_in_input - $codepoint_at ); - } - - ++$seen_codepoints; - $codepoint_at = $position_in_input; - } elseif ( UTF8_DECODER_REJECT === $decoder_state ) { - // "\u{FFFD}" is not supported in PHP 5.6. - $buffer .= "\xEF\xBF\xBD"; - - // Skip to the start of the next code point. - while ( UTF8_DECODER_REJECT === $decoder_state && $position_in_input < $end_byte ) { - $decoder_state = utf8_decoder_apply_byte( $text[ ++$position_in_input ], UTF8_DECODER_ACCEPT ); - } - - ++$seen_codepoints; - $codepoint_at = $position_in_input; - $decoder_state = UTF8_DECODER_ACCEPT; - } else { - ++$position_in_input; - } - } - - return $buffer; -} - -/** - * Extract a unicode codepoint from a specific offset in text. - * Invalid byte sequences count as a single code point, U+FFFD - * (the Unicode replacement character ``). - * - * This function does not permit passing negative indices and will return - * null if such are provided. - * - * @param string $text Input text from which to extract. - * @param int $byte_offset Start at this byte offset in the input text. - * @param int $matched_bytes How many bytes were matched to produce the codepoint. - * - * @return int Unicode codepoint. - */ -function utf8_codepoint_at( string $text, int $byte_offset = 0, &$matched_bytes = 0 ) { - if ( $byte_offset < 0 ) { - return null; - } - - $position_in_input = $byte_offset; - $codepoint_at = $byte_offset; - $end_byte = strlen( $text ); - $codepoint = null; - $decoder_state = UTF8_DECODER_ACCEPT; - - // Get to the start of the string. - while ( $position_in_input < $end_byte ) { - $decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state ); - - if ( UTF8_DECODER_ACCEPT === $decoder_state ) { - ++$position_in_input; - $codepoint = utf8_ord( substr( $text, $codepoint_at, $position_in_input - $codepoint_at ) ); - break; - } elseif ( UTF8_DECODER_REJECT === $decoder_state ) { - // "\u{FFFD}" is not supported in PHP 5.6. - $codepoint = utf8_ord( "\xEF\xBF\xBD" ); - break; - } else { - ++$position_in_input; - } - } - - $matched_bytes = $position_in_input - $byte_offset; - - return $codepoint; -} - -/** - * Convert a UTF-8 byte sequence to its Unicode codepoint. - * - * @param string $character UTF-8 encoded byte sequence representing a single Unicode character. - * - * @return int Unicode codepoint. - */ -function utf8_ord( string $character ): int { - // Convert the byte sequence to its binary representation. - $bytes = unpack( 'C*', $character ); - - // Initialize the codepoint. - $codepoint = 0; - - // Calculate the codepoint based on the number of bytes. - if ( 1 === count( $bytes ) ) { - $codepoint = $bytes[1]; - } elseif ( 2 === count( $bytes ) ) { - $codepoint = ( ( $bytes[1] & 0x1F ) << 6 ) | ( $bytes[2] & 0x3F ); - } elseif ( 3 === count( $bytes ) ) { - $codepoint = ( ( $bytes[1] & 0x0F ) << 12 ) | ( ( $bytes[2] & 0x3F ) << 6 ) | ( $bytes[3] & 0x3F ); - } elseif ( 4 === count( $bytes ) ) { - $codepoint = ( ( $bytes[1] & 0x07 ) << 18 ) | ( ( $bytes[2] & 0x3F ) << 12 ) | ( ( $bytes[3] & 0x3F ) << 6 ) | ( $bytes[4] & 0x3F ); - } - - return $codepoint; -} diff --git a/components/Encoding/utf8.php b/components/Encoding/utf8.php new file mode 100644 index 00000000..0c74f7c1 --- /dev/null +++ b/components/Encoding/utf8.php @@ -0,0 +1,199 @@ +=' ) +) : + /** + * Replaces ill-formed UTF-8 byte sequences with the Unicode Replacement Character. + * + * Knowing what to do in the presence of text encoding issues can be complicated. + * This function replaces invalid spans of bytes to neutralize any corruption that + * may be there and prevent it from causing further problems downstream. + * + * However, it’s not always ideal to replace those bytes. In some settings it may + * be best to leave the invalid bytes in the string so that downstream code can handle + * them in a specific way. Replacing the bytes too early, like escaping for HTML too + * early, can introduce other forms of corruption and data loss. + * + * When in doubt, use this function to replace spans of invalid bytes. + * + * Replacement follows the “maximal subpart” algorithm for secure and interoperable + * strings. This can lead to sequences of multiple replacement characters in a row. + * + * Example: + * + * // Valid strings come through unchanged. + * 'test' === wp_scrub_utf8( 'test' ); + * + * // Invalid sequences of bytes are replaced. + * $invalid = "the byte \xC0 is never allowed in a UTF-8 string."; + * "the byte \u{FFFD} is never allowed in a UTF-8 string." === wp_scrub_utf8( $invalid, true ); + * 'the byte � is never allowed in a UTF-8 string.' === wp_scrub_utf8( $invalid, true ); + * + * // Maximal subparts are replaced individually. + * '.�.' === wp_scrub_utf8( ".\xC0." ); // C0 is never valid. + * '.�.' === wp_scrub_utf8( ".\xE2\x8C." ); // Missing A3 at end. + * '.��.' === wp_scrub_utf8( ".\xE2\x8C\xE2\x8C." ); // Maximal subparts replaced separately. + * '.��.' === wp_scrub_utf8( ".\xC1\xBF." ); // Overlong sequence. + * '.���.' === wp_scrub_utf8( ".\xED\xA0\x80." ); // Surrogate half. + * + * Note! The Unicode Replacement Character is itself a Unicode character (U+FFFD). + * Once a span of invalid bytes has been replaced by one, it will not be possible + * to know whether the replacement character was originally intended to be there + * or if it is the result of scrubbing bytes. It is ideal to leave replacement for + * display only, but some contexts (e.g. generating XML or passing data into a + * large language model) require valid input strings. + * + * @since 6.9.0 + * + * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630 + * + * @param string $text String which is assumed to be UTF-8 but may contain invalid sequences of bytes. + * @return string Input text with invalid sequences of bytes replaced with the Unicode replacement character. + */ + function wp_scrub_utf8( $text ) { + /* + * While it looks like setting the substitute character could fail, + * the internal PHP code will never fail when provided a valid + * code point as a number. In this case, there’s no need to check + * its return value to see if it succeeded. + */ + $prev_replacement_character = mb_substitute_character(); + mb_substitute_character( 0xFFFD ); + $scrubbed = mb_scrub( $text, 'UTF-8' ); + mb_substitute_character( $prev_replacement_character ); + + return $scrubbed; + } +else : + /** + * Fallback function for scrubbing UTF-8. + * + * @ignore + * @private + * + * @since 6.9.0 + */ + function wp_scrub_utf8( $text ) { + return _wp_scrub_utf8_fallback( $text ); + } +endif; + +function _wp_can_use_pcre_u( $set = null ) { + static $utf8_pcre = 'reset'; + + if ( null !== $set ) { + $utf8_pcre = $set; + } + + if ( 'reset' === $utf8_pcre ) { + // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- intentional error generated to detect PCRE/u support. + $utf8_pcre = @preg_match( '/^./u', 'a' ); + } + + return $utf8_pcre; +} + +if ( _wp_can_use_pcre_u() ) : + /** + * Returns whether the given string contains Unicode noncharacters. + * + * XML recommends against using noncharacters and HTML forbids their + * use in attribute names. Unicode recommends that they not be used + * in open exchange of data. + * + * Noncharacters are code points within the following ranges: + * - U+FDD0–U+FDEF + * - U+FFFE–U+FFFF + * - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF + * + * @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612 + * @see https://www.w3.org/TR/xml/#charsets + * @see https://html.spec.whatwg.org/#attributes-2 + * + * @since 6.9.0 + * + * @param string $text Are there noncharacters in this string? + * @return bool Whether noncharacters were found in the string. + */ + function wp_has_noncharacters( string $text ): bool { + return 1 === preg_match( + '/[\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]/u', + $text + ); + } +else : + /** + * Fallback function for detecting noncharacters in a text. + * + * @ignore + * @private + * + * @since 6.9.0 + */ + function wp_has_noncharacters( string $text ): bool { + return _wp_has_noncharacters_fallback( $text ); + } +endif; diff --git a/composer-ci-matrix-tests.json b/composer-ci-matrix-tests.json index 91c612e2..948e04a0 100644 --- a/composer-ci-matrix-tests.json +++ b/composer-ci-matrix-tests.json @@ -1,94 +1,89 @@ { - "name": "wordpress/components", - "type": "library", - "description": "WordPress Components", - "keywords": [ - "wordpress", - "components" - ], - "homepage": "https://wordpress.org", - "license": "GPL-2.0-or-later", - "authors": [ - { - "name": "Adam Zielinski", - "email": "adam@adamziel.com" - }, - { - "name": "WordPress Contributors" - } - ], - "require": { - "php": ">=7.2", - "ext-json": "*", - "ext-mbstring": "*" - }, - "require-dev": { - "yoast/phpunit-polyfills": "^4.0.0", - "phpcompatibility/php-compatibility": "^9.3.5", - "phpunit/phpunit": "8.5.x || ^9.5" - }, - "autoload": { - "exclude-from-classmap": [ - "**/Tests/", - "**/bin/", - "/Tests/" - ], - "classmap": [ - "components/BlockParser/", - "components/Blueprints/", - "components/Blueprints/vendor-patched/", - "components/CLI/", - "components/DataLiberation/", - "components/DataLiberation/vendor-patched/", - "components/Filesystem/", - "components/Git/", - "components/HTML/./", - "components/HttpClient/", - "components/HttpServer/", - "components/Markdown/", - "components/Markdown/vendor-patched", - "components/Merge/", - "components/Merge/vendor-patched", - "components/ByteStream/", - "components/XML/", - "components/Zip/" - ], - "files": [ - "components/DataLiberation/URL/functions.php", - "components/Encoding/utf8-decoder.php", - "components/Encoding/utf8-encoder.php", - "components/Filesystem/functions.php", - "components/Zip/functions.php", - "components/Polyfill/wordpress.php", - "components/Polyfill/mbstring.php", - "components/Git/functions.php" - ], - "psr-4": { - "Rowbot\\": "components/DataLiberation/vendor-patched/", - "Brick\\": "components/DataLiberation/vendor-patched/", - "WordPress\\CORSProxy\\": "components/CORSProxy/" - } - }, - "scripts": { - "build-blueprints-phar": "box compile -c phar-box.json", - "regenerate-json-schema": "node components/Blueprints/Versions/Version2/json-schema/regenerate-schema.ts", - "test": "phpunit -c phpunit.xml", - "lint": "phpcs .", - "lint-fix": "phpcbf ." - }, - "repositories": [ - { - "type": "path", - "url": "components/*", - "options": { - "symlink": true - } - } - ], - "minimum-stability": "dev", - "config": { - "allow-plugins": { - "dealerdirect/phpcodesniffer-composer-installer": true - } - } + "name": "wordpress/components", + "type": "library", + "description": "WordPress Components", + "keywords": ["wordpress", "components"], + "homepage": "https://wordpress.org", + "license": "GPL-2.0-or-later", + "authors": [ + { + "name": "Adam Zielinski", + "email": "adam@adamziel.com" + }, + { + "name": "WordPress Contributors" + } + ], + "require": { + "php": ">=7.2", + "ext-json": "*", + "ext-mbstring": "*" + }, + "require-dev": { + "yoast/phpunit-polyfills": "^4.0.0", + "phpcompatibility/php-compatibility": "^9.3.5", + "phpunit/phpunit": "8.5.x || ^9.5" + }, + "autoload": { + "exclude-from-classmap": ["**/Tests/", "**/bin/", "/Tests/"], + "classmap": [ + "components/BlockParser/", + "components/Blueprints/", + "components/Blueprints/vendor-patched/", + "components/CLI/", + "components/DataLiberation/", + "components/DataLiberation/vendor-patched/", + "components/Filesystem/", + "components/Git/", + "components/HTML/./", + "components/HttpClient/", + "components/HttpServer/", + "components/Markdown/", + "components/Markdown/vendor-patched", + "components/Merge/", + "components/Merge/vendor-patched", + "components/ByteStream/", + "components/XML/", + "components/Zip/" + ], + "files": [ + "components/DataLiberation/URL/functions.php", + "components/Encoding/utf8.php", + "components/Encoding/compat-utf8.php", + "components/Encoding/utf8-encoder.php", + "components/Encoding/utf8-decoder.php", + "components/Filesystem/functions.php", + "components/Zip/functions.php", + "components/Polyfill/wordpress.php", + "components/Polyfill/mbstring.php", + "components/Git/functions.php" + ], + "psr-4": { + "Rowbot\\": "components/DataLiberation/vendor-patched/", + "Brick\\": "components/DataLiberation/vendor-patched/", + "WordPress\\CORSProxy\\": "components/CORSProxy/" + } + }, + "scripts": { + "build-blueprints-phar": "box compile -c phar-box.json", + "regenerate-json-schema": "node components/Blueprints/Versions/Version2/json-schema/regenerate-schema.ts", + "test": "phpunit -c phpunit.xml", + "lint": "phpcs .", + "lint-fix": "phpcbf ." + }, + "repositories": [ + { + "type": "path", + "url": "components/*", + "options": { + "symlink": true + } + } + ], + "minimum-stability": "dev", + "config": { + "allow-plugins": { + "dealerdirect/phpcodesniffer-composer-installer": true + } + } } diff --git a/composer.json b/composer.json index ba5ce8a6..1423cfa3 100644 --- a/composer.json +++ b/composer.json @@ -1,107 +1,109 @@ { - "name": "wp-php-toolkit/php-toolkit", - "type": "library", - "description": "WordPress Components", - "keywords": [ - "wordpress", - "components" - ], - "homepage": "https://wordpress.org", - "license": "GPL-2.0-or-later", - "authors": [ - { - "name": "Adam Zielinski", - "email": "adam@adamziel.com" - }, - { - "name": "WordPress Contributors" - } - ], - "require": { - "php": ">=7.2", - "ext-json": "*", - "ext-mbstring": "*" - }, - "require-dev": { - "yoast/phpunit-polyfills": "2.0.0", - "squizlabs/php_codesniffer": "^3.13.4", - "phpcompatibility/php-compatibility": "^9.3.5", - "slevomat/coding-standard": "^8.21.1", - "wp-coding-standards/wpcs": "^3.2.0", - "phpunit/phpunit": "^9.5", - "phpstan/phpstan": "^1.0" - }, - "autoload": { - "exclude-from-classmap": [ - "**/Tests/", - "**/bin/", - "/Tests/" - ], - "classmap": [ - "components/BlockParser/", - "components/Blueprints/", - "components/Blueprints/vendor-patched/", - "components/CLI/", - "components/DataLiberation/", - "components/DataLiberation/vendor-patched/", - "components/Filesystem/", - "components/Git/", - "components/HTML/./", - "components/HttpClient/", - "components/HttpServer/", - "components/Markdown/", - "components/Markdown/vendor-patched", - "components/Merge/", - "components/Merge/vendor-patched", - "components/ByteStream/", - "components/ToolkitCodingStandards/", - "components/XML/", - "components/Zip/" - ], - "files": [ - "components/DataLiberation/URL/functions.php", - "components/Encoding/utf8-decoder.php", - "components/Encoding/utf8-encoder.php", - "components/Filesystem/functions.php", - "components/Zip/functions.php", - "components/Polyfill/wordpress.php", - "components/Polyfill/mbstring.php", - "components/Polyfill/php-functions.php", - "components/Git/functions.php" - ], - "psr-4": { - "Rowbot\\": "components/DataLiberation/vendor-patched/", - "Brick\\": "components/DataLiberation/vendor-patched/", - "WordPress\\CORSProxy\\": "components/CORSProxy/" - } - }, - "scripts": { - "build-php-toolkit-phar": "bash bin/build-libraries-phar.sh", - "build-blueprints-phar": "box compile -c phar-blueprints.json", - "regenerate-json-schema": "node components/Blueprints/Versions/Version2/json-schema/regenerate-schema.ts", - "test": "phpunit -c phpunit.xml", - "lint": "phpcs -d memory_limit=1G .", - "lint-fix": "phpcbf -d memory_limit=1G ." - }, - "repositories": [ - { - "type": "path", - "url": "components/*", - "options": { - "symlink": true - } - } - ], - "minimum-stability": "dev", - "config": { - "allow-plugins": { - "dealerdirect/phpcodesniffer-composer-installer": true - } - }, - "archive": { - "exclude": [ - "/plugins", - "/examples" - ] - } + "name": "wp-php-toolkit/php-toolkit", + "type": "library", + "description": "WordPress Components", + "keywords": [ + "wordpress", + "components" + ], + "homepage": "https://wordpress.org", + "license": "GPL-2.0-or-later", + "authors": [ + { + "name": "Adam Zielinski", + "email": "adam@adamziel.com" + }, + { + "name": "WordPress Contributors" + } + ], + "require": { + "php": ">=7.2", + "ext-json": "*", + "ext-mbstring": "*" + }, + "require-dev": { + "yoast/phpunit-polyfills": "2.0.0", + "squizlabs/php_codesniffer": "^3.13.4", + "phpcompatibility/php-compatibility": "^9.3.5", + "slevomat/coding-standard": "^8.21.1", + "wp-coding-standards/wpcs": "^3.2.0", + "phpunit/phpunit": "^9.5", + "phpstan/phpstan": "^1.0" + }, + "autoload": { + "exclude-from-classmap": [ + "**/Tests/", + "**/bin/", + "/Tests/" + ], + "classmap": [ + "components/BlockParser/", + "components/Blueprints/", + "components/Blueprints/vendor-patched/", + "components/CLI/", + "components/DataLiberation/", + "components/DataLiberation/vendor-patched/", + "components/Filesystem/", + "components/Git/", + "components/HTML/./", + "components/HttpClient/", + "components/HttpServer/", + "components/Markdown/", + "components/Markdown/vendor-patched", + "components/Merge/", + "components/Merge/vendor-patched", + "components/ByteStream/", + "components/ToolkitCodingStandards/", + "components/XML/", + "components/Zip/" + ], + "files": [ + "components/DataLiberation/URL/functions.php", + "components/Encoding/utf8.php", + "components/Encoding/compat-utf8.php", + "components/Encoding/utf8-encoder.php", + "components/Encoding/utf8-decoder.php", + "components/Filesystem/functions.php", + "components/Zip/functions.php", + "components/Polyfill/wordpress.php", + "components/Polyfill/mbstring.php", + "components/Polyfill/php-functions.php", + "components/Git/functions.php" + ], + "psr-4": { + "Rowbot\\": "components/DataLiberation/vendor-patched/", + "Brick\\": "components/DataLiberation/vendor-patched/", + "WordPress\\CORSProxy\\": "components/CORSProxy/" + } + }, + "scripts": { + "build-php-toolkit-phar": "bash bin/build-libraries-phar.sh", + "build-blueprints-phar": "box compile -c phar-blueprints.json", + "regenerate-json-schema": "node components/Blueprints/Versions/Version2/json-schema/regenerate-schema.ts", + "test": "phpunit -c phpunit.xml", + "lint": "phpcs -d memory_limit=1G .", + "lint-fix": "phpcbf -d memory_limit=1G ." + }, + "repositories": [ + { + "type": "path", + "url": "components/*", + "options": { + "symlink": true + } + } + ], + "minimum-stability": "dev", + "config": { + "allow-plugins": { + "dealerdirect/phpcodesniffer-composer-installer": true + } + }, + "archive": { + "exclude": [ + "/plugins", + "/examples" + ] + } }