Skip to content

Commit

Permalink
locale.c: Revamp finding if locale is UTF-8
Browse files Browse the repository at this point in the history
This changes how this functionality works for the LC_CTYPE locale.  On
systems that have nl_langinfo() one can get a definitive answer from
just that.  Otherwise (or if that doesn't return properly) one can use
mbtowc() to check if the UTF-8 byte sequence for the Unicode REPLACEMENT
CHARACTER actually is considered to be that code point.  This is also
definitive.  If the maximum byte string length for a character is too
short to handle all Unicode UTF-8, we know without further checking that
this isn't a UTF-8 locale, so can avoid the mbtowc check.
  • Loading branch information
khwilliamson committed Jan 13, 2018
1 parent 0022a19 commit 1a19889
Showing 1 changed file with 57 additions and 67 deletions.
124 changes: 57 additions & 67 deletions locale.c
Expand Up @@ -41,6 +41,10 @@

#include "reentr.h"

#ifdef I_WCHAR
# include <wchar.h>
#endif

/* If the environment says to, we can output debugging information during
* initialization. This is done before option parsing, and before any thread
* creation, so can be a file-level static */
Expand Down Expand Up @@ -3060,7 +3064,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
const char *save_input_locale = NULL;

bool is_utf8 = FALSE; /* The return value */
STRLEN final_pos;

/* The variables below are for the cache of previous lookups using this
* function. The cache is a C string, described at the definition for
Expand Down Expand Up @@ -3149,14 +3152,14 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
/* Here we don't have stored the utf8ness for the input locale. We have to
* calculate it */

# if defined(USE_LOCALE_CTYPE) \
&& (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))

{ /* Next try nl_langinfo or MB_CUR_MAX if available */
# if defined(USE_LOCALE_CTYPE) \
&& ( (defined(HAS_NL_LANGINFO) && defined(CODESET)) \
|| defined(HAS_MBTOWC))

char *save_ctype_locale = NULL;
{
const char *save_ctype_locale = NULL;

if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
if (category != LC_CTYPE) {

/* Get the current LC_CTYPE locale */
save_ctype_locale = do_setlocale_c(LC_CTYPE, NULL);
Expand Down Expand Up @@ -3187,9 +3190,23 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
save_input_locale));

/* Here the current LC_CTYPE is set to the locale of the category whose
* information is desired. This means that nl_langinfo() and MB_CUR_MAX
* information is desired. This means that nl_langinfo() and mbtowc()
* should give the correct results */

# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding
calling the functions if we have this */

/* Standard UTF-8 needs at least 4 bytes to represent the maximum
* Unicode code point. */

DEBUG_L(PerlIO_printf(Perl_debug_log, "\tMB_CUR_MAX=%d\n",
(int) MB_CUR_MAX));
if (MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
is_utf8 = FALSE;
goto finish_ctype;
}

# endif
# if defined(HAS_NL_LANGINFO) && defined(CODESET)

{ /* The task is easiest if the platform has this POSIX 2001 function */
Expand All @@ -3200,11 +3217,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
"\tnllanginfo returned CODESET '%s'\n", codeset));

if (codeset && strNE(codeset, "")) {
/* If we switched LC_CTYPE, switch back */
if (save_ctype_locale) {
do_setlocale_c(LC_CTYPE, save_ctype_locale);
Safefree(save_ctype_locale);
}

/* If the implementation of foldEQ() somehow were
* to change to not go byte-by-byte, this could
Expand All @@ -3217,56 +3229,36 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
DEBUG_L(PerlIO_printf(Perl_debug_log,
"\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
codeset, is_utf8));
goto finish_and_return;
goto finish_ctype;
}
}

# endif
# ifdef MB_CUR_MAX

/* Here, either we don't have nl_langinfo, or it didn't return a
* codeset. Try MB_CUR_MAX */

/* Standard UTF-8 needs at least 4 bytes to represent the maximum
* Unicode code point. Since UTF-8 is the only non-single byte
* encoding we handle, we just say any such encoding is UTF-8, and if
* turns out to be wrong, other things will fail */
is_utf8 = (unsigned) MB_CUR_MAX >= STRLENs(MAX_UNICODE_UTF8);

DEBUG_L(PerlIO_printf(Perl_debug_log,
"\tMB_CUR_MAX=%d; ?UTF8 locale=%d\n",
(int) MB_CUR_MAX, is_utf8));
# if defined(HAS_MBTOWC)
/* The task can be accomplished essentially 100% if have this
* function. It was a late adder to C89, so very likely to have it. */

Safefree(save_input_locale);

# ifdef HAS_MBTOWC

/* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
* since they are both in the C99 standard. We can feed a known byte
* string to the latter function, and check that it gives the expected
* result */
if (is_utf8) {
{
wchar_t wc;
int len;

/* mbtowc() converts a byte string to a wide character. Feed a byte
* string to it and check that the result is the expected Unicode
* code point */

PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
errno = 0;
len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));

DEBUG_L(PerlIO_printf(Perl_debug_log,
"\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
len, (unsigned int) wc, errno));

if ( len != STRLENs(REPLACEMENT_CHARACTER_UTF8)
|| wc != (wchar_t) UNICODE_REPLACEMENT)
{
is_utf8 = FALSE;
DEBUG_L(PerlIO_printf(Perl_debug_log, "\replacement=U+%x\n",
(unsigned int)wc));
DEBUG_L(PerlIO_printf(Perl_debug_log,
"\treturn from mbtowc=%d; errno=%d; ?UTF8 locale=0\n",
len, errno));
}
is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
&& wc == (wchar_t) UNICODE_REPLACEMENT);
}

# endif
finish_ctype:

/* If we switched LC_CTYPE, switch back */
if (save_ctype_locale) {
Expand All @@ -3275,21 +3267,17 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
}

goto finish_and_return;

# endif

}

# else /* nl_langinfo should work if available, so don't bother compiling this
fallback code. The final fallback of looking at the name is
compiled, and will be executed if nl_langinfo fails */
# endif
# else
/* nl_langinfo not available or failed somehow. Next try looking at the
* currency symbol to see if it disambiguates things. Often that will be
* in the native script, and if the symbol isn't in UTF-8, we know that the
* locale isn't. If it is non-ASCII UTF-8, we infer that the locale is
* too, as the odds of a non-UTF8 string being valid UTF-8 are quite small
* */
/* Here, we must have a C89 compiler that doesn't have mbtowc(). Next
* try looking at the currency symbol to see if it disambiguates
* things. Often that will be in the native script, and if the symbol
* isn't in UTF-8, we know that the locale isn't. If it is non-ASCII
* UTF-8, we infer that the locale is too, as the odds of a non-UTF8
* string being valid UTF-8 are quite small */

# ifdef HAS_LOCALECONV
# ifdef USE_LOCALE_MONETARY
Expand Down Expand Up @@ -3535,9 +3523,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
}

# endif
# endif /* the code that is compiled when no nl_langinfo */

# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
UTF-8 locale */

/* As a last resort, look at the locale name to see if it matches
Expand All @@ -3547,7 +3533,9 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
* its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
* be a UTF-8 locale. Similarly for the other common names */

final_pos = strlen(save_input_locale) - 1;
{
const Size_t final_pos = strlen(save_input_locale) - 1;

if (final_pos >= 3) {
const char *name = save_input_locale;

Expand Down Expand Up @@ -3590,6 +3578,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
is_utf8 = TRUE;
goto finish_and_return;
}
}

# endif
# endif
Expand All @@ -3598,21 +3587,23 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
* since we are about to return FALSE anyway, there is no point in doing
* this extra work */

# if 0
# if 0
if (instr(save_input_locale, "8859")) {
DEBUG_L(PerlIO_printf(Perl_debug_log,
"Locale %s has 8859 in name, not UTF-8 locale\n",
save_input_locale));
is_utf8 = FALSE;
goto finish_and_return;
}
# endif
# endif

DEBUG_L(PerlIO_printf(Perl_debug_log,
"Assuming locale %s is not a UTF-8 locale\n",
save_input_locale));
is_utf8 = FALSE;

# endif /* the code that is compiled when no modern LC_CTYPE */

finish_and_return:

/* Cache this result so we don't have to go through all this next time. */
Expand Down Expand Up @@ -3674,7 +3665,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)

#endif


bool
Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
{
Expand Down

0 comments on commit 1a19889

Please sign in to comment.