Skip to content

Commit

Permalink
locale.c: Improved fallback UTF-8 locale detection
Browse files Browse the repository at this point in the history
This adds some more checks for when the platform lacks mbtowc().  We can
check if things like isprint(), toupper() match what a UTF-8 locale
would do.  If not, we can rule out UTF-8.
  • Loading branch information
khwilliamson committed Jan 9, 2018
1 parent 627c39a commit 4b9e970
Showing 1 changed file with 150 additions and 12 deletions.
162 changes: 150 additions & 12 deletions locale.c
Expand Up @@ -3231,9 +3231,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
/* Here we don't have stored the utf8ness for the input locale. We have to /* Here we don't have stored the utf8ness for the input locale. We have to
* calculate it */ * calculate it */


# if defined(USE_LOCALE_CTYPE) \ # ifdef USE_LOCALE_CTYPE
&& ( (defined(HAS_NL_LANGINFO) && defined(CODESET)) \
|| (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))


{ {
const char *original_ctype_locale const char *original_ctype_locale
Expand Down Expand Up @@ -3329,21 +3327,161 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)


is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8) is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
&& wc == (wchar_t) UNICODE_REPLACEMENT); && wc == (wchar_t) UNICODE_REPLACEMENT);
restore_switched_locale(LC_CTYPE, original_ctype_locale);
goto finish_and_return;
}

# else /* Below, don't have mbtowc nor mbrtowc */

/* Here, we don't have mbtowc(). We can use earlier functions to see
* if the first 256 code points match what we expect in a UTF-8 locale.
* If they don't, we can rule out UTF-8. But if they do match, it
* could also be 8859-1, which is indistinguishable in the first 256
* code points by these functions, or it could be something very
* similar to 8859-1; we can't know for sure. There is also a bug in
* some platforms, in which locales that are ASCIIish-only can't be
* distinguished from UTF-8 by these functions. */

{
unsigned int i;
unsigned int maybe_ASCII = 0;

for (i = 0; i < 256; i++) {

/* This makes this portable to EBCDIC */
unsigned int j = LATIN1_TO_NATIVE(i);

U8 dummy_buf[UTF8_MAXBYTES_CASE+1];
STRLEN dummy_len;
IV uc;

if ( cBOOL(isalnum(j)) != cBOOL(isALPHANUMERIC(j))
|| cBOOL(isalpha(j)) != cBOOL(isALPHA_L1(j))
|| cBOOL(iscntrl(j)) != cBOOL(isCNTRL_L1(j))
|| cBOOL(isdigit(j)) != cBOOL(isDIGIT_L1(j))
|| cBOOL(isgraph(j)) != cBOOL(isGRAPH_L1(j))
|| cBOOL(islower(j)) != cBOOL(isLOWER_L1(j))
|| cBOOL(isprint(j)) != cBOOL(isPRINT_L1(j))
|| cBOOL(ispunct(j)) != cBOOL(isPUNCT_L1(j))
|| cBOOL(isspace(j)) != cBOOL(isSPACE_L1(j))
|| cBOOL(isupper(j)) != cBOOL(isUPPER_L1(j))
|| cBOOL(isxdigit(j))!= cBOOL(isXDIGIT_L1(j))
|| tolower(j) != (int) toLOWER_L1(j))
{
/* An ASCII locale could have both cntrl, and its
* complement return 0 */
if (i > 127 && iscntrl(j) == 0 && isprint(j) == 0) {
maybe_ASCII++;
continue;
}

/* Otherwise something really failed and isn't any of:
* ASCII, 8859-1, nor UTF-8 */
is_utf8 = FALSE;
restore_switched_locale(LC_CTYPE, original_ctype_locale);
goto finish_and_return;
}

/* Calculate the Unicode uppercase */
uc = toUPPER_uvchr(j, dummy_buf, &dummy_len);

/* If it is out of range of these functions, they will return
* the input itself */
if (uc > 255) {
uc = j;
}
if (toupper(j) != uc) {
is_utf8 = FALSE;
restore_switched_locale(LC_CTYPE, original_ctype_locale);
goto finish_and_return;
}
} /* End of 0..255 loop */

if (maybe_ASCII == 0) {

/* Here, all code points 0-255 match what both ISO 8859-1 (or a
* very similar one) and a UTF-8 locale think. But we need
* more information to distinguish between these two */

# ifdef MB_CUR_MAX

/* In the unlikely event that the compiler has MB_CUR_MAX,
* from tests above we have that extra information, as this is
* a multi-byte locale. khw doesn't think there exist any
* other multi-byte locales which are consistent with 8859-1
* in the range 0-255. (This event is unlikely because we
* only got here because mbtowc() doesn't exist on this
* platform, and it's unlikely that such a platform would have
* MB_CUR_MAX) */

is_utf8 = TRUE;
restore_switched_locale(LC_CTYPE, original_ctype_locale);
goto finish_and_return;

# endif

/* Otherwise drop down to the other tests */
}
else {

/* Implementations of ASCII locales could decide that all
* isfoo() functions for non-ASCII return 0. khw thinks it
* sufficient to have tested just iscntrl() and its complement,
* isprint(), which we did above and counted the results. But
* if the count is fewer than all such code points, it's a
* defective locale definition, and it's not 8859-1, so can't
* be UTF8 either */

if (maybe_ASCII < 128) {
is_utf8 = FALSE;
restore_switched_locale(LC_CTYPE, original_ctype_locale);
goto finish_and_return;
}

/* Here, we have a locale that looks to be ASCII. As mentioned
* above, some, platforms have the bug where all isfoo()
* functions return 0 for all non-ASCII code points in a UTF-8
* locale. (You must use the iswfoo() equivalents for them to
* work properly, even though the code point fits in a byte.
* The same applies to the case changing functions.) This makes
* UTF-8 locales indistinguishable from ASCII locale based on
* these functions. (khw presumes that the bug isn't just for
* UTF-8 locales, but any multi-byte one.) We need further
* information to decide between ASCII and UTF-8. This section
* of code doesn't get compiled unless we are missing some late
* adders to C89, so it's unlikey any of the wide char
* functions will be available. So we continue on to try other
* checks */
}
} }


restore_switched_locale(LC_CTYPE, original_ctype_locale); restore_switched_locale(LC_CTYPE, original_ctype_locale);
goto finish_and_return;
}


# endif # endif /* All LC_CTYPE sub-varieties */

} /* End of USE_LC_CTYPE */

# else # else


/* Here, we must have a C89 compiler that doesn't have mbtowc(). Next /* To get here, we either are not to look at LC_CTYPE on this platform or
* try looking at the currency symbol to see if it disambiguates * we have a C89 compiler without mbtowc(), and the locale is one of:
* things. Often that will be in the native script, and if the symbol * a) UTF-8
* isn't in UTF-8, we know that the locale isn't. If it is non-ASCII * b) 8859-1-ish
* UTF-8, we infer that the locale is too, as the odds of a non-UTF8 * c) ASCII-ish
* string being valid UTF-8 are quite small */ * To try to tease apart which, we can look at various functions sensitive
* to other categories, if available. */

# endif /* End of #else for USE_LC_CTYPE */

/* There's no point in compiling the code below if we did use LC_CTYPE and
* have mbtowc(). (Also only compile if have access to the other
* categories) */

# if ( ! defined(USE_LOCALE_CTYPE) || ( ! defined(HAS_MBTOWC) \
&& ! defined(HAS_MBRTOWC))) \
&& ( (defined(USE_LOCALE_TIME) && defined(HAS_STRFTIME)) \
|| (defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)) \
|| (defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)))


# ifdef USE_LOCALE_MONETARY # ifdef USE_LOCALE_MONETARY


Expand Down

0 comments on commit 4b9e970

Please sign in to comment.