Permalink
Browse files

Improve fallback UTF-8 locale detection

If the libc doesn't have modern enough routines, we use a fallback
mechanism to see if a locale is UTF-8 or not.  One component of this is
to look at the byte sequence for the currency symbol.  Obviously, if the
sequence isn't valid UTF-8, the locale isn't either.  But if it is valid
UTF-8, and hence might be a UTF-8 locale, this commit changes the
detection mechanism to see if the sequence evaluates, when interpreted
as UTF-8 to be a known Unicode currency symbol.  If so, the locale must
be UTF-8, as the odds of some other locale having a sequence that does
this are vanishingly small.

If the sequence doesn't evaluate to a currency symbol, that doesn't tell
us anything, as plenty of places have a string of letters be their
currency symbol.  Nor if the symbol is a '$', as that is invariant under
UTF-8 vs not, so doesn't help us.

This pretty much guarantees that a UTF-8 locale for the European Union
or the UK that otherwise looks like plain English (Latin script) will be
properly determined to be UTF-8, as the symbols for their currencies
will pass this test.
  • Loading branch information...
khwilliamson committed Jan 6, 2018
1 parent 6745fe9 commit f67e8d85093cd5475aec9296ad0ad3fbef9154e0
Showing with 61 additions and 23 deletions.
  1. +1 −0 embedvar.h
  2. +1 −0 intrpvar.h
  3. +56 −23 locale.c
  4. +2 −0 perl.c
  5. +1 −0 sv.c
View
@@ -42,6 +42,7 @@
#define PL_AboveLatin1 (vTHX->IAboveLatin1)
#define PL_Assigned_invlist (vTHX->IAssigned_invlist)
#define PL_Currency_Symbol (vTHX->ICurrency_Symbol)
#define PL_DBcontrol (vTHX->IDBcontrol)
#define PL_DBcv (vTHX->IDBcv)
#define PL_DBgv (vTHX->IDBgv)
View
@@ -627,6 +627,7 @@ PERLVAR(I, Latin1, SV *)
PERLVAR(I, UpperLatin1, SV *) /* Code points 128 - 255 */
PERLVAR(I, AboveLatin1, SV *)
PERLVAR(I, InBitmap, SV *)
PERLVAR(I, Currency_Symbol, SV *)
PERLVAR(I, NonL1NonFinalFold, SV *)
PERLVAR(I, HasMultiCharFold, SV *)
View
@@ -3162,7 +3162,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
char * name_pos; /* position of 'delimited' in the cache, or 0
if not there */
# ifdef LC_ALL
assert(category != LC_ALL);
@@ -3349,18 +3348,18 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
# ifdef USE_LOCALE_MONETARY
/* If have LC_MONETARY, we can look at the currency symbol. Often that
* will be in the native script. We do this one first because there is
* just one string to examine, so potentially avoids work */
* will be in the native script and/or use a Unicode currency symbol */
{
const char *original_monetary_locale
= switch_category_locale_to_template(LC_MONETARY,
category,
save_input_locale);
bool only_ascii = FALSE;
= switch_category_locale_to_template(
LC_MONETARY,
category,
save_input_locale);
const U8 * currency_string
= (const U8 *) my_nl_langinfo(PERL_CRNCYSTR, FALSE);
/* 2nd param not relevant for this item */
/* 2nd param not relevant for this item */
const U8 * e = currency_string + strlen((char *) currency_string);
const U8 * first_variant;
assert( *currency_string == '-'
@@ -3369,26 +3368,60 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
currency_string++;
if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
if (! is_utf8_invariant_string_loc(currency_string,
e - currency_string,
&first_variant))
{
DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
only_ascii = TRUE;
}
else {
is_utf8 = is_strict_utf8_string(first_variant, 0);
}
SCX_enum this_script;
restore_switched_locale(LC_MONETARY, original_monetary_locale);
if ( ! is_strict_utf8_string(first_variant, e - first_variant)
|| ! isSCRIPT_RUN((U8 * ) currency_string, e, TRUE, &this_script)
|| this_script == SCX_Unknown)
{
is_utf8 = FALSE;
restore_switched_locale(LC_MONETARY, original_monetary_locale);
goto finish_and_return;
}
/* Here the currency string contains a variant under UTF-8, and
* when interpreted as UTF-8, the string as a whole is in a
* valid single Unicode script. Look at the string's
* individual characters. If one sequence of the UTF-8 variant
* bytes, when treated as UTF-8, evaluates to a code point
* which is a Unicode currency symbol, then this must be a
* UTF-8 locale. The odds that any other locale would have
* such a sequence in its currency symbol that would
* coincidentally look like a valid Unicode currency symbol are
* vanishingly small */
if (PL_Currency_Symbol == NULL) {
PL_Currency_Symbol =
_new_invlist_C_array(Currency_Symbol_invlist);
}
if (! only_ascii) {
while (currency_string < e) {
IV cp;
Size_t len;
/* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
* otherwise assume the locale is UTF-8 if and only if the symbol
* is non-ascii UTF-8. */
DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
save_input_locale, is_utf8));
goto finish_and_return;
if (UTF8_IS_INVARIANT(*currency_string)) {
currency_string++;
continue;
}
cp = utf8_to_uvchr_buf(currency_string, e, &len);
if (_invlist_search(PL_Currency_Symbol, cp) > 0) {
is_utf8 = TRUE;
restore_switched_locale(LC_MONETARY,
original_monetary_locale);
goto finish_and_return;
}
currency_string += len;
}
}
restore_switched_locale(LC_MONETARY, original_monetary_locale);
}
# endif /* USE_LOCALE_MONETARY */
View
2 perl.c
@@ -1175,6 +1175,7 @@ perl_destruct(pTHXx)
SvREFCNT_dec(PL_utf8_foldclosures);
SvREFCNT_dec(PL_AboveLatin1);
SvREFCNT_dec(PL_InBitmap);
SvREFCNT_dec(PL_Currency_Symbol);
SvREFCNT_dec(PL_UpperLatin1);
SvREFCNT_dec(PL_Latin1);
SvREFCNT_dec(PL_NonL1NonFinalFold);
@@ -1191,6 +1192,7 @@ perl_destruct(pTHXx)
PL_utf8_idcont = NULL;
PL_utf8_foldclosures = NULL;
PL_AboveLatin1 = NULL;
PL_Currency_Symbol = NULL;
PL_InBitmap = NULL;
PL_HasMultiCharFold = NULL;
#ifdef USE_LOCALE_CTYPE
View
1 sv.c
@@ -15561,6 +15561,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
PL_UpperLatin1 = sv_dup_inc(proto_perl->IUpperLatin1, param);
PL_AboveLatin1 = sv_dup_inc(proto_perl->IAboveLatin1, param);
PL_InBitmap = sv_dup_inc(proto_perl->IInBitmap, param);
PL_Currency_Symbol = sv_dup_inc(proto_perl->ICurrency_Symbol, param);
PL_NonL1NonFinalFold = sv_dup_inc(proto_perl->INonL1NonFinalFold, param);
PL_HasMultiCharFold = sv_dup_inc(proto_perl->IHasMultiCharFold, param);

0 comments on commit f67e8d8

Please sign in to comment.