From d4c66f385c27d9776f0b9e65c2df22b33a434f33 Mon Sep 17 00:00:00 2001 From: Sukhendu Sekhar Guria Date: Mon, 6 Apr 2026 11:03:54 +0530 Subject: [PATCH] Formatting: Refactor sanitize_title_with_dashes() with PCRE Unicode and WP_HTML_Decoder --- src/wp-includes/formatting.php | 81 ++++++++++++------- .../formatting/sanitizeTitleWithDashes.php | 41 ++++++++++ 2 files changed, 93 insertions(+), 29 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 498d676f5c20f..9f3b3f8d11f6e 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -2274,12 +2274,31 @@ function sanitize_title_for_query( $title ) { * @param string $title The title to be sanitized. * @param string $raw_title Optional. Not used. Default empty. * @param string $context Optional. The operation for which the string is sanitized. - * When set to 'save', additional entities are converted to hyphens - * or stripped entirely. Default 'display'. + * When set to 'save', HTML entities are decoded to raw UTF-8 and + * Unicode dash punctuation and separators are converted to hyphens. + * Default 'display'. * @return string The sanitized title. */ function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'display' ) { $title = strip_tags( $title ); + + if ( 'save' === $context ) { + /* + * Decode HTML entities to raw UTF-8, ensuring all representations of the same + * character are treated identically. + */ + $title = WP_HTML_Decoder::decode_text_node( $title ); + + $title = str_replace( '&', '', $title ); + + if ( _wp_can_use_pcre_u() ) { + $title = preg_replace( '~[\p{Pd}\p{Z}]~u', '-', $title ) ?? $title; + } + + // Convert forward slash to hyphen. + $title = str_replace( '/', '-', $title ); + } + // Preserve escaped octets. $title = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $title ); // Remove percent signs that are not part of an octet. 
@@ -2297,12 +2316,38 @@ function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'displa $title = strtolower( $title ); if ( 'save' === $context ) { - // Convert &nbsp, non-breaking hyphen, &ndash, and &mdash to hyphens. + /* + * Convert known dash punctuation and space separator variants to hyphens. + * + * These are the percent-encoded UTF-8 forms produced by utf8_uri_encode(). + * When _wp_can_use_pcre_u() is true, raw UTF-8 dash/space chars were already + * replaced by PCRE above, so these str_replace() calls become no-ops for those. + * They remain necessary to handle inputs that arrived as pre-encoded percent + * sequences. + */ $title = str_replace( array( '%c2%a0', '%e2%80%91', '%e2%80%93', '%e2%80%94' ), '-', $title ); - // Convert &nbsp, non-breaking hyphen, &ndash, and &mdash HTML entities to hyphens. - $title = str_replace( array( '&nbsp;', '&#8209;', '&#160;', '&ndash;', '&#8211;', '&mdash;', '&#8212;' ), '-', $title ); - // Convert forward slash to hyphen. - $title = str_replace( '/', '-', $title ); + + // Convert space separator variants (percent-encoded) to hyphen. + $title = str_replace( + array( + '%e2%80%80', // En quad. + '%e2%80%81', // Em quad. + '%e2%80%82', // En space. + '%e2%80%83', // Em space. + '%e2%80%84', // Three-per-em space. + '%e2%80%85', // Four-per-em space. + '%e2%80%86', // Six-per-em space. + '%e2%80%87', // Figure space. + '%e2%80%88', // Punctuation space. + '%e2%80%89', // Thin space. + '%e2%80%8a', // Hair space. + '%e2%80%a8', // Line separator. + '%e2%80%a9', // Paragraph separator. + '%e2%80%af', // Narrow no-break space. + ), + '-', + $title + ); // Strip these characters entirely. $title = str_replace( @@ -2361,28 +2406,6 @@ function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'displa $title ); - // Convert non-visible characters that display with a width to hyphen. - $title = str_replace( - array( - '%e2%80%80', // En quad. - '%e2%80%81', // Em quad. - '%e2%80%82', // En space. - '%e2%80%83', // Em space. 
- '%e2%80%84', // Three-per-em space. - '%e2%80%85', // Four-per-em space. - '%e2%80%86', // Six-per-em space. - '%e2%80%87', // Figure space. - '%e2%80%88', // Punctuation space. - '%e2%80%89', // Thin space. - '%e2%80%8a', // Hair space. - '%e2%80%a8', // Line separator. - '%e2%80%a9', // Paragraph separator. - '%e2%80%af', // Narrow no-break space. - ), - '-', - $title - ); - // Convert &times to 'x'. $title = str_replace( '%c3%97', 'x', $title ); } diff --git a/tests/phpunit/tests/formatting/sanitizeTitleWithDashes.php b/tests/phpunit/tests/formatting/sanitizeTitleWithDashes.php index 8a2ee4f9d9fee..d472db55e031b 100644 --- a/tests/phpunit/tests/formatting/sanitizeTitleWithDashes.php +++ b/tests/phpunit/tests/formatting/sanitizeTitleWithDashes.php @@ -327,6 +327,47 @@ public function data_converts_non_visible_characters_with_width_to_hyphen() { ); } + /** + * @ticket 64151 + */ + public function test_replaces_hex_nbsp_entity() { + $this->assertSame( 'dont-break-the-space', sanitize_title_with_dashes( "don\u{2019}t break&#xA0;the space", '', 'save' ) ); + } + + /** + * @ticket 64151 + */ + public function test_replaces_hex_ndash_mdash_entities() { + $this->assertSame( 'do-the-dash', sanitize_title_with_dashes( 'Do &#x2013; the Dash', '', 'save' ) ); + $this->assertSame( 'do-the-dash', sanitize_title_with_dashes( 'Do &#x2014; the Dash', '', 'save' ) ); + } + + /** + * @ticket 64151 + */ + public function test_replaces_hex_non_breaking_hyphen_entity() { + $this->assertSame( 'do-the-dash', sanitize_title_with_dashes( 'Do &#x2011; the Dash', '', 'save' ) ); + } + + /** + * @ticket 64151 + */ + public function test_replaces_additional_dash_punctuation() { + $this->assertSame( 'do-the-dash', sanitize_title_with_dashes( "Do \u{2012} the Dash", '', 'save' ) ); + $this->assertSame( 'do-the-dash', sanitize_title_with_dashes( "Do \u{2015} the Dash", '', 'save' ) ); + $this->assertSame( 'do-the-dash', sanitize_title_with_dashes( "Do \u{2010} the Dash", '', 'save' ) ); + } + + /** + * @ticket 64151 + */ + 
public function test_replaces_additional_space_separators() { + $this->assertSame( 'do-the-space', sanitize_title_with_dashes( "Do \u{1680} the Space", '', 'save' ) ); + $this->assertSame( 'do-the-space', sanitize_title_with_dashes( "Do \u{202F} the Space", '', 'save' ) ); + $this->assertSame( 'do-the-space', sanitize_title_with_dashes( "Do \u{205F} the Space", '', 'save' ) ); + $this->assertSame( 'do-the-space', sanitize_title_with_dashes( "Do \u{3000} the Space", '', 'save' ) ); + } + /** * @ticket 47912 * @dataProvider data_non_visible_characters_with_width_to_hyphen_when_not_save