diff --git a/src/wp-admin/options-reading.php b/src/wp-admin/options-reading.php index 5dd40f37c4e3..8f3ba83a5f2c 100644 --- a/src/wp-admin/options-reading.php +++ b/src/wp-admin/options-reading.php @@ -64,7 +64,7 @@ 'blog_charset' ) ); } ?> diff --git a/src/wp-admin/options.php b/src/wp-admin/options.php index 8510095e2456..34b98ae294f9 100644 --- a/src/wp-admin/options.php +++ b/src/wp-admin/options.php @@ -160,7 +160,7 @@ $mail_options = array( 'mailserver_url', 'mailserver_port', 'mailserver_login', 'mailserver_pass' ); -if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) { +if ( ! is_utf8_charset() ) { $allowed_options['reading'][] = 'blog_charset'; } diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php index b4ef3d2f671a..c50fc69a047e 100644 --- a/src/wp-includes/compat.php +++ b/src/wp-includes/compat.php @@ -91,7 +91,7 @@ function _mb_substr( $str, $start, $length = null, $encoding = null ) { * The solution below works only for UTF-8, so in case of a different * charset just use built-in substr(). */ - if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) { + if ( ! is_utf8_charset( $encoding ) ) { return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length ); } @@ -176,7 +176,7 @@ function _mb_strlen( $str, $encoding = null ) { * The solution below works only for UTF-8, so in case of a different charset * just use built-in strlen(). */ - if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) { + if ( ! 
is_utf8_charset( $encoding ) ) { return strlen( $str ); } diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 77c772001c96..7c03c484ddb7 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -960,19 +960,7 @@ function _wp_specialchars( $text, $quote_style = ENT_NOQUOTES, $charset = false, $quote_style = ENT_QUOTES; } - // Store the site charset as a static to avoid multiple calls to wp_load_alloptions(). - if ( ! $charset ) { - static $_charset = null; - if ( ! isset( $_charset ) ) { - $alloptions = wp_load_alloptions(); - $_charset = isset( $alloptions['blog_charset'] ) ? $alloptions['blog_charset'] : ''; - } - $charset = $_charset; - } - - if ( in_array( $charset, array( 'utf8', 'utf-8', 'UTF8' ), true ) ) { - $charset = 'UTF-8'; - } + $charset = _canonical_charset( $charset ? $charset : get_option( 'blog_charset' ) ); $_quote_style = $quote_style; @@ -1114,7 +1102,7 @@ function wp_check_invalid_utf8( $text, $strip = false ) { // Store the site charset as a static to avoid multiple calls to get_option(). static $is_utf8 = null; if ( ! isset( $is_utf8 ) ) { - $is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ); + $is_utf8 = is_utf8_charset(); } if ( ! $is_utf8 ) { return $text; diff --git a/src/wp-includes/functions.php b/src/wp-includes/functions.php index 0cc4c44ad12e..3634190663b6 100644 --- a/src/wp-includes/functions.php +++ b/src/wp-includes/functions.php @@ -7474,17 +7474,27 @@ function get_tag_regex( $tag ) { * * @see https://core.trac.wordpress.org/ticket/23688 * - * @param string $charset A charset name. + * @param string $charset A charset name, e.g. "UTF-8", "Windows-1252", "SJIS". * @return string The canonical form of the charset. 
 */ function _canonical_charset( $charset ) { - if ( 'utf-8' === strtolower( $charset ) || 'utf8' === strtolower( $charset ) ) { - + if ( is_utf8_charset( $charset ) ) { return 'UTF-8'; } - if ( 'iso-8859-1' === strtolower( $charset ) || 'iso8859-1' === strtolower( $charset ) ) { - + /* + * Normalize the ISO-8859-1 family of charset names. + * + * This is not required for htmlspecialchars(), as it properly recognizes all of + * the input character sets that are transformed here into "ISO-8859-1". + * + * @todo Should this entire check be removed since it's not required for the stated purpose? + * @todo Should WordPress transform other potential charset equivalents, such as "latin1"? + */ + if ( + ( 0 === strcasecmp( 'iso-8859-1', $charset ) ) || + ( 0 === strcasecmp( 'iso8859-1', $charset ) ) + ) { return 'ISO-8859-1'; } diff --git a/src/wp-settings.php b/src/wp-settings.php index 9673479bfab7..4d8a35ae8358 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -106,6 +106,7 @@ wp_set_lang_dir(); // Load early WordPress files. +require ABSPATH . WPINC . '/unicode.php'; require ABSPATH . WPINC . '/class-wp-list-util.php'; require ABSPATH . WPINC . '/formatting.php'; require ABSPATH . WPINC . 
'/meta.php'; diff --git a/tests/phpunit/tests/functions/canonicalCharset.php b/tests/phpunit/tests/functions/canonicalCharset.php index f13e0cca154e..b46b6f5ce5b4 100644 --- a/tests/phpunit/tests/functions/canonicalCharset.php +++ b/tests/phpunit/tests/functions/canonicalCharset.php @@ -10,45 +10,54 @@ * @covers ::_canonical_charset */ class Tests_Functions_CanonicalCharset extends WP_UnitTestCase { - - public function test_utf_8_lower() { - $this->assertSame( 'UTF-8', _canonical_charset( 'utf-8' ) ); - } - - public function test_utf_8_upper() { - $this->assertSame( 'UTF-8', _canonical_charset( 'UTF-8' ) ); - } - - public function test_utf_8_mixxed() { - $this->assertSame( 'UTF-8', _canonical_charset( 'Utf-8' ) ); - } - - public function test_utf_8() { - $this->assertSame( 'UTF-8', _canonical_charset( 'UTF8' ) ); - } - - public function test_iso_lower() { - $this->assertSame( 'ISO-8859-1', _canonical_charset( 'iso-8859-1' ) ); - } - - public function test_iso_upper() { - $this->assertSame( 'ISO-8859-1', _canonical_charset( 'ISO-8859-1' ) ); - } - - public function test_iso_mixxed() { - $this->assertSame( 'ISO-8859-1', _canonical_charset( 'Iso8859-1' ) ); - } - - public function test_iso() { - $this->assertSame( 'ISO-8859-1', _canonical_charset( 'ISO8859-1' ) ); - } - - public function test_random() { - $this->assertSame( 'random', _canonical_charset( 'random' ) ); + /** + * Ensures that charset variants for common encodings normalize to the expected form. + * + * @ticket 61182 + * + * @dataProvider data_charset_normalizations + * + * @param string $given_charset Potential charset provided by user. + * @param string $normalized_charset Expected normalized form of charset. + */ + public function test_properly_normalizes_charset_variants( $given_charset, $normalized_charset ) { + $this->assertSame( + $normalized_charset, + _canonical_charset( $given_charset ), + 'Did not properly transform the provided charset into its normalized form.' 
+ ); } public function test_empty() { - $this->assertSame( '', _canonical_charset( '' ) ); + /** + * Data provider. + * + * @return array[] + */ + public static function data_charset_normalizations() { + return array( + // UTF-8 family. + array( 'UTF-8', 'UTF-8' ), + array( 'utf-8', 'UTF-8' ), + array( 'Utf-8', 'UTF-8' ), + array( 'UTF8', 'UTF-8' ), + + // Almost UTF-8. + array( 'UTF-8*', 'UTF-8*' ), + array( 'UTF.8', 'UTF.8' ), + array( 'UTF88', 'UTF88' ), + array( 'UTF-7', 'UTF-7' ), + array( 'X-UTF-8', 'X-UTF-8' ), + + // ISO-8859-1 family. + array( 'iso-8859-1', 'ISO-8859-1' ), + array( 'ISO-8859-1', 'ISO-8859-1' ), + array( 'Iso-8859-1', 'ISO-8859-1' ), + array( 'ISO8859-1', 'ISO-8859-1' ), + + // Other charset slugs should not be adjusted. + array( 'random', 'random' ), + array( '', '' ), + ); } /**