diff --git a/src/wp-includes/class-wp-query.php b/src/wp-includes/class-wp-query.php
index cf07b07d977c3..96fae867a3c6c 100644
--- a/src/wp-includes/class-wp-query.php
+++ b/src/wp-includes/class-wp-query.php
@@ -1431,7 +1431,36 @@ protected function parse_search( &$query_vars ) {
 			$query_vars['s'] = urldecode( $query_vars['s'] );
 		}
 		// There are no line breaks in fields.
-		$query_vars['s']                  = str_replace( array( "\r", "\n" ), '', $query_vars['s'] );
+		$query_vars['s'] = str_replace( array( "\r", "\n" ), '', $query_vars['s'] );
+		/*
+		 * Replace every Unicode space separator (Zs) other than U+0020 with a plain
+		 * space so that CJK ideographic spaces and the like separate search words.
+		 * NOTE(review): this also rewrites 's' as used by 'sentence' searches below.
+		 *
+		 * @see https://core.trac.wordpress.org/ticket/44296
+		 */
+		$query_vars['s'] = str_replace(
+			array(
+				"\u{00A0}", // No-Break Space.
+				"\u{1680}", // Ogham Space Mark.
+				"\u{2000}", // En Quad.
+				"\u{2001}", // Em Quad.
+				"\u{2002}", // En Space.
+				"\u{2003}", // Em Space.
+				"\u{2004}", // Three-Per-Em Space.
+				"\u{2005}", // Four-Per-Em Space.
+				"\u{2006}", // Six-Per-Em Space.
+				"\u{2007}", // Figure Space.
+				"\u{2008}", // Punctuation Space.
+				"\u{2009}", // Thin Space.
+				"\u{200A}", // Hair Space.
+				"\u{202F}", // Narrow No-Break Space.
+				"\u{205F}", // Medium Mathematical Space.
+				"\u{3000}", // Ideographic Space (CJK).
+			),
+			' ',
+			$query_vars['s']
+		);
 		$query_vars['search_terms_count'] = 1;
 		if ( ! empty( $query_vars['sentence'] ) ) {
 			$query_vars['search_terms'] = array( $query_vars['s'] );
diff --git a/tests/phpunit/tests/query/search.php b/tests/phpunit/tests/query/search.php
index 7bfbdec31c87d..bcab14fdf88ca 100644
--- a/tests/phpunit/tests/query/search.php
+++ b/tests/phpunit/tests/query/search.php
@@ -73,6 +73,39 @@ public function filter_wp_search_stopwords() {
 		return array();
 	}
 
+	/**
+	 * Tests that U+3000 ideographic spaces split the search string into terms when the stopword list is filtered to be empty.
+	 *
+	 * @ticket 44296
+	 */
+	public function test_ideographic_space_separator_with_no_stopwords() {
+		$terms = "This\u{3000}is\u{3000}a\u{3000}search\u{3000}term";
+		add_filter( 'wp_search_stopwords', array( $this, 'filter_wp_search_stopwords' ) );
+		$query = new WP_Query( array( 's' => $terms ) );
+		remove_filter( 'wp_search_stopwords', array( $this, 'filter_wp_search_stopwords' ) );
+		// The expected count (5) includes 'a', which the expected terms list (4 entries) omits.
+		$this->assertSame( 5, $query->get( 'search_terms_count' ) );
+		$this->assertSame( array( 'This', 'is', 'search', 'term' ), $query->get( 'search_terms' ) );
+	}
+
+	/**
+	 * Tests that a Zs-category space other than U+3000 (here only U+2003, Em Space) also separates search terms.
+	 *
+	 * @ticket 44296
+	 */
+	public function test_unicode_space_separators_are_treated_as_separators() {
+		add_filter( 'wp_search_stopwords', array( $this, 'filter_wp_search_stopwords' ) );
+
+		// U+2003 Em Space stands in for the Zs category; the other code points are not exercised here.
+		$terms = "search\u{2003}term";
+		$query = new WP_Query( array( 's' => $terms ) );
+
+		remove_filter( 'wp_search_stopwords', array( $this, 'filter_wp_search_stopwords' ) );
+
+		$this->assertSame( 2, $query->get( 'search_terms_count' ) );
+		$this->assertSame( array( 'search', 'term' ), $query->get( 'search_terms' ) );
+	}
+
 	/**
 	 * @ticket 38099
 	 */