Skip to content

Commit

Permalink
Normalize UTF-8 charset slug detection.
Browse files Browse the repository at this point in the history
There are several existing places in Core that attempt to detect if a blog charset
is UTF-8. Each place attempts to perform the same check, except the logic is
spread throughout and there's no single method provided to make this
determination in a consistent way. The `_canonical_charset()` method exists,
but it is marked as private.

In this patch the new `unicode` module provides `is_utf8_charset()` as a method
taking an optional charset slug and indicating if it represents UTF-8,
examining all of the allowable variants of that slug. Associated code is
updated to use this new function, including `_canonical_charset()`. If no slug
is provided, `is_utf8_charset()` will look up the current
`get_option( 'blog_charset' )` value.

Finally, the test functions governing `_canonical_charset()` have been
rewritten as a single test with a data provider instead of as separate test
functions.

Developed in #6535
Discussed in https://core.trac.wordpress.org/ticket/61182

Fixes #61182.
Props dmsnell, jonsurrell.


git-svn-id: https://develop.svn.wordpress.org/trunk@58147 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
dmsnell committed May 14, 2024
1 parent c3a4e8b commit d4967a3
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 60 deletions.
2 changes: 1 addition & 1 deletion src/wp-admin/options-reading.php
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
<?php
settings_fields( 'reading' );

if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
if ( ! is_utf8_charset() ) {
add_settings_field( 'blog_charset', __( 'Encoding for pages and feeds' ), 'options_reading_blog_charset', 'reading', 'default', array( 'label_for' => 'blog_charset' ) );
}
?>
Expand Down
2 changes: 1 addition & 1 deletion src/wp-admin/options.php
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@

$mail_options = array( 'mailserver_url', 'mailserver_port', 'mailserver_login', 'mailserver_pass' );

if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
if ( ! is_utf8_charset() ) {
$allowed_options['reading'][] = 'blog_charset';
}

Expand Down
4 changes: 2 additions & 2 deletions src/wp-includes/compat.php
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ function _mb_substr( $str, $start, $length = null, $encoding = null ) {
* The solution below works only for UTF-8, so in case of a different
* charset just use built-in substr().
*/
if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
if ( ! is_utf8_charset( $encoding ) ) {
return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
}

Expand Down Expand Up @@ -176,7 +176,7 @@ function _mb_strlen( $str, $encoding = null ) {
* The solution below works only for UTF-8, so in case of a different charset
* just use built-in strlen().
*/
if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
if ( ! is_utf8_charset( $encoding ) ) {
return strlen( $str );
}

Expand Down
16 changes: 2 additions & 14 deletions src/wp-includes/formatting.php
Original file line number Diff line number Diff line change
Expand Up @@ -960,19 +960,7 @@ function _wp_specialchars( $text, $quote_style = ENT_NOQUOTES, $charset = false,
$quote_style = ENT_QUOTES;
}

// Store the site charset as a static to avoid multiple calls to wp_load_alloptions().
if ( ! $charset ) {
static $_charset = null;
if ( ! isset( $_charset ) ) {
$alloptions = wp_load_alloptions();
$_charset = isset( $alloptions['blog_charset'] ) ? $alloptions['blog_charset'] : '';
}
$charset = $_charset;
}

if ( in_array( $charset, array( 'utf8', 'utf-8', 'UTF8' ), true ) ) {
$charset = 'UTF-8';
}
$charset = _canonical_charset( $charset ? $charset : get_option( 'blog_charset' ) );

$_quote_style = $quote_style;

Expand Down Expand Up @@ -1114,7 +1102,7 @@ function wp_check_invalid_utf8( $text, $strip = false ) {
// Store the site charset as a static to avoid multiple calls to get_option().
static $is_utf8 = null;
if ( ! isset( $is_utf8 ) ) {
$is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true );
$is_utf8 = is_utf8_charset();
}
if ( ! $is_utf8 ) {
return $text;
Expand Down
20 changes: 15 additions & 5 deletions src/wp-includes/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -7474,17 +7474,27 @@ function get_tag_regex( $tag ) {
*
* @see https://core.trac.wordpress.org/ticket/23688
*
* @param string $charset A charset name.
* @param string $charset A charset name, e.g. "UTF-8", "Windows-1252", "SJIS".
* @return string The canonical form of the charset.
*/
function _canonical_charset( $charset ) {
if ( 'utf-8' === strtolower( $charset ) || 'utf8' === strtolower( $charset ) ) {

if ( is_utf8_charset( $charset ) ) {
return 'UTF-8';
}

if ( 'iso-8859-1' === strtolower( $charset ) || 'iso8859-1' === strtolower( $charset ) ) {

/*
* Normalize the ISO-8859-1 family of languages.
*
* This is not required for htmlspecialchars(), as it properly recognizes all of
* the input character sets that here are transformed into "ISO-8859-1".
*
* @todo Should this entire check be removed since it's not required for the stated purpose?
* @todo Should WordPress transform other potential charset equivalents, such as "latin1"?
*/
if (
( 0 === strcasecmp( 'iso-8859-1', $charset ) ) ||
( 0 === strcasecmp( 'iso8859-1', $charset ) )
) {
return 'ISO-8859-1';
}

Expand Down
1 change: 1 addition & 0 deletions src/wp-settings.php
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
wp_set_lang_dir();

// Load early WordPress files.
require ABSPATH . WPINC . '/unicode.php';
require ABSPATH . WPINC . '/class-wp-list-util.php';
require ABSPATH . WPINC . '/formatting.php';
require ABSPATH . WPINC . '/meta.php';
Expand Down
83 changes: 46 additions & 37 deletions tests/phpunit/tests/functions/canonicalCharset.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,54 @@
* @covers ::_canonical_charset
*/
class Tests_Functions_CanonicalCharset extends WP_UnitTestCase {

public function test_utf_8_lower() {
$this->assertSame( 'UTF-8', _canonical_charset( 'utf-8' ) );
}

public function test_utf_8_upper() {
$this->assertSame( 'UTF-8', _canonical_charset( 'UTF-8' ) );
}

public function test_utf_8_mixxed() {
$this->assertSame( 'UTF-8', _canonical_charset( 'Utf-8' ) );
}

public function test_utf_8() {
$this->assertSame( 'UTF-8', _canonical_charset( 'UTF8' ) );
}

public function test_iso_lower() {
$this->assertSame( 'ISO-8859-1', _canonical_charset( 'iso-8859-1' ) );
}

public function test_iso_upper() {
$this->assertSame( 'ISO-8859-1', _canonical_charset( 'ISO-8859-1' ) );
}

public function test_iso_mixxed() {
$this->assertSame( 'ISO-8859-1', _canonical_charset( 'Iso8859-1' ) );
}

public function test_iso() {
$this->assertSame( 'ISO-8859-1', _canonical_charset( 'ISO8859-1' ) );
}

public function test_random() {
$this->assertSame( 'random', _canonical_charset( 'random' ) );
/**
* Ensures that charset variants for common encodings normalize to the expected form.
*
* @ticket 61182
*
* @dataProvider data_charset_normalizations
*
* @param string $given_charset Potential charset provided by user.
* @param string $normalized_charset Expected normalized form of charset.
*/
public function test_properly_normalizes_charset_variants( $given_charset, $normalized_charset ) {
$this->assertSame(
$normalized_charset,
_canonical_charset( $given_charset ),
'Did not properly transform the provided charset into its normalized form.'
);
}

public function test_empty() {
$this->assertSame( '', _canonical_charset( '' ) );
/**
* Data provider.
*
* @return array[].
*/
public static function data_charset_normalizations() {
return array(
// UTF-8 family.
array( 'UTF-8', 'UTF-8' ),
array( 'Utf-8', 'UTF-8' ),
array( 'Utf-8', 'UTF-8' ),
array( 'UTF8', 'UTF-8' ),

// Almost UTF-8.
array( 'UTF-8*', 'UTF-8*' ),
array( 'UTF.8', 'UTF.8' ),
array( 'UTF88', 'UTF88' ),
array( 'UTF-7', 'UTF-7' ),
array( 'X-UTF-8', 'X-UTF-8' ),

// ISO-8859-1 family.
array( 'iso-8859-1', 'ISO-8859-1' ),
array( 'ISO-8859-1', 'ISO-8859-1' ),
array( 'Iso-8859-1', 'ISO-8859-1' ),
array( 'ISO8859-1', 'ISO-8859-1' ),

// Other charset slugs should not be adjusted.
array( 'random', 'random' ),
array( '', '' ),
);
}

/**
Expand Down

0 comments on commit d4967a3

Please sign in to comment.