Skip to content

Commit

Permalink
Cache locale UTF8-ness lookups
Browse files Browse the repository at this point in the history
Some locales are UTF-8, some are not.  Knowledge of this is needed in
various circumstances.  This commit saves the results of the last
several lookups so they don't have to be recalculated each time.

The full generality of POSIX locales is such that you can have error
messages be displayed in one locale, say Spanish, while other things are
in French.  To accommodate this generality, the program can loop through
all the locale categories finding the UTF8ness of the locale it points
to.  However, in almost all instances, people are going to be in either
French or in Spanish, and not in some combination.  Suppose it is a
French UTF-8 locale for all categories.  This new cache will know that
the French locale is UTF-8, and the queries for all but the first
category can return that immediately.

This simple cache avoids the overhead of hashes.

This also fixes a bug I realized exists in threaded perls, but haven't
reproduced.  We do not support locales in such perls, and the user must
not change the locale or 'use locale'.  But perl itself could change the
locale behind the scenes, leading to segfaults or incorrect results.
One such instance is the determination of UTF8ness.  But this only could
happen if the full generality of locales is used so that the categories
are not all in the same locale.  This could only happen (if the user
doesn't change locales) if the environment is such that the perl program
is started up so that the categories are in such a state.  This commit
fixes this potential bug by caching the UTF8ness of each category at
startup, before any threads are instantiated, and so checking for it
later just looks it up in the cache, without perl changing the locale.
  • Loading branch information
khwilliamson committed Jan 13, 2018
1 parent 398f244 commit f705e65
Show file tree
Hide file tree
Showing 3 changed files with 208 additions and 43 deletions.
1 change: 1 addition & 0 deletions embedvar.h
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@
#define PL_lastgotoprobe (vTHX->Ilastgotoprobe) #define PL_lastgotoprobe (vTHX->Ilastgotoprobe)
#define PL_laststatval (vTHX->Ilaststatval) #define PL_laststatval (vTHX->Ilaststatval)
#define PL_laststype (vTHX->Ilaststype) #define PL_laststype (vTHX->Ilaststype)
#define PL_locale_utf8ness (vTHX->Ilocale_utf8ness)
#define PL_localizing (vTHX->Ilocalizing) #define PL_localizing (vTHX->Ilocalizing)
#define PL_localpatches (vTHX->Ilocalpatches) #define PL_localpatches (vTHX->Ilocalpatches)
#define PL_lockhook (vTHX->Ilockhook) #define PL_lockhook (vTHX->Ilockhook)
Expand Down
2 changes: 2 additions & 0 deletions intrpvar.h
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -262,6 +262,8 @@ PERLVAR(I, exit_flags, U8) /* was exit() unexpected, etc. */
PERLVAR(I, utf8locale, bool) /* utf8 locale detected */ PERLVAR(I, utf8locale, bool) /* utf8 locale detected */
PERLVAR(I, in_utf8_CTYPE_locale, bool) PERLVAR(I, in_utf8_CTYPE_locale, bool)
PERLVAR(I, in_utf8_COLLATE_locale, bool) PERLVAR(I, in_utf8_COLLATE_locale, bool)
PERLVARA(I, locale_utf8ness, 256, char)

#ifdef USE_LOCALE_CTYPE #ifdef USE_LOCALE_CTYPE
PERLVAR(I, warn_locale, SV *) PERLVAR(I, warn_locale, SV *)
#endif #endif
Expand Down
248 changes: 205 additions & 43 deletions locale.c
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -76,24 +76,46 @@ static bool debug_initialization = FALSE;


#ifdef USE_LOCALE #ifdef USE_LOCALE


/* /* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
* Standardize the locale name from a string returned by 'setlocale', possibly * looked up. This is in the form of a C string: */
* modifying that string.
* #define UTF8NESS_SEP "\v"
* The typical return value of setlocale() is either #define UTF8NESS_PREFIX "\a"
* (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
* (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL /* So, the string looks like:
* (the space-separated values represent the various sublocales,
* in some unspecified order). This is not handled by this function.
* *
* In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
* which is harmful for further use of the string in setlocale(). This
* function removes the trailing new line and everything up through the '='
* *
*/ * where the digit 0 after the \a indicates that the locale starting just
* after the preceding \v is not UTF-8, and the digit 1 mean it is. */

STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);

#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"

/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
* kept there always. The remining portion of the cache is LRU, with the
* oldest looked-up locale at the tail end */

STATIC char * STATIC char *
S_stdize_locale(pTHX_ char *locs) S_stdize_locale(pTHX_ char *locs)
{ {
/* Standardize the locale name from a string returned by 'setlocale',
* possibly modifying that string.
*
* The typical return value of setlocale() is either
* (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
* (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
* (the space-separated values represent the various sublocales,
* in some unspecified order). This is not handled by this function.
*
* In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
* which is harmful for further use of the string in setlocale(). This
* function removes the trailing new line and everything up through the '='
* */

const char * const s = strchr(locs, '='); const char * const s = strchr(locs, '=');
bool okay = TRUE; bool okay = TRUE;


Expand Down Expand Up @@ -2006,6 +2028,12 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX); assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
# endif # endif
# endif /* DEBUGGING */ # endif /* DEBUGGING */

/* Initialize the cache of the program's UTF-8ness for the always known
* locales C and POSIX */
my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
sizeof(PL_locale_utf8ness));

# ifdef LOCALE_ENVIRON_REQUIRED # ifdef LOCALE_ENVIRON_REQUIRED


/* /*
Expand Down Expand Up @@ -2362,8 +2390,24 @@ Perl_init_i18nl10n(pTHX_ int printwarn)


# endif # endif



for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {

# if defined(USE_ITHREADS)

/* This caches whether each category's locale is UTF-8 or not. This
* may involve changing the locale. It is ok to do this at
* initialization time before any threads have started, but not later.
* Caching means that if the program heeds our dictate not to change
* locales in threaded applications, this data will remain valid, and
* it may get queried without changing locales. If the environment is
* such that all categories have the same locale, this isn't needed, as
* the code will not change the locale; but this handles the uncommon
* case where the environment has disparate locales for the categories
* */
(void) _is_cur_LC_category_utf8(categories[i]);

# endif

Safefree(curlocales[i]); Safefree(curlocales[i]);
} }


Expand Down Expand Up @@ -3012,39 +3056,105 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
* English, it comes down to if the locale's name ends in something like * English, it comes down to if the locale's name ends in something like
* "UTF-8". It errs on the side of not being a UTF-8 locale. */ * "UTF-8". It errs on the side of not being a UTF-8 locale. */


/* Name of current locale corresponding to the input category */
const char *save_input_locale = NULL; const char *save_input_locale = NULL;

bool is_utf8 = FALSE; /* The return value */
STRLEN final_pos; STRLEN final_pos;


/* The variables below are for the cache of previous lookups using this
* function. The cache is a C string, described at the definition for
* 'C_and_POSIX_utf8ness'.
*
* The first part of the cache is fixed, for the C and POSIX locales. The
* varying part starts just after them. */
char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);

Size_t utf8ness_cache_size; /* Size of the varying portion */
Size_t input_name_len; /* Length in bytes of save_input_locale */
Size_t input_name_len_with_overhead; /* plus extra chars used to store
the name in the cache */
char * delimited; /* The name plus the delimiters used to store
it in the cache */
char * name_pos; /* position of 'delimited' in the cache, or 0
if not there */


# ifdef LC_ALL # ifdef LC_ALL


assert(category != LC_ALL); assert(category != LC_ALL);


# endif # endif


/* First dispose of the trivial cases */ /* Get the desired category's locale */
save_input_locale = do_setlocale_r(category, NULL); save_input_locale = do_setlocale_r(category, NULL);
if (! save_input_locale) { if (! save_input_locale) {
DEBUG_L(PerlIO_printf(Perl_debug_log, Perl_croak(aTHX_
"Could not find current locale for category %d\n", "panic: %s: %d: Could not find current locale for %s\n",
category)); __FILE__, __LINE__, category_name(category));
return FALSE; /* XXX maybe should croak */
} }

save_input_locale = stdize_locale(savepv(save_input_locale)); save_input_locale = stdize_locale(savepv(save_input_locale));
if (isNAME_C_OR_POSIX(save_input_locale)) { DEBUG_L(PerlIO_printf(Perl_debug_log,
DEBUG_L(PerlIO_printf(Perl_debug_log, "Current locale for %s is %s\n",
"Current locale for category %d is %s\n", category_name(category), save_input_locale));
category, save_input_locale));
input_name_len = strlen(save_input_locale);

/* In our cache, each name is accompanied by two delimiters and a single
* utf8ness digit */
input_name_len_with_overhead = input_name_len + 3;

/* Allocate and populate space for a copy of the name surrounded by the
* delimiters */
Newx(delimited, input_name_len_with_overhead, char);
delimited[0] = UTF8NESS_SEP[0];
Copy(save_input_locale, delimited + 1, input_name_len, char);
delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
delimited[input_name_len+2] = '\0';

/* And see if that is in the cache */
name_pos = instr(PL_locale_utf8ness, delimited);
if (name_pos) {
is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';

# ifdef DEBUGGING

if (DEBUG_Lv_TEST || debug_initialization) {
PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n",
save_input_locale, is_utf8);
}

# endif

/* And, if not already in that position, move it to the beginning of
* the non-constant portion of the list, since it is the most recently
* used. (We don't have to worry about overflow, since just moving
* existing names around) */
if (name_pos > utf8ness_cache) {
Move(utf8ness_cache,
utf8ness_cache + input_name_len_with_overhead,
input_name_len_with_overhead, char);
Copy(delimited,
utf8ness_cache,
input_name_len_with_overhead - 1, char);
utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
}

Safefree(delimited);
Safefree(save_input_locale); Safefree(save_input_locale);
return FALSE; return is_utf8;
} }


/* Here we don't have stored the utf8ness for the input locale. We have to
* calculate it */

# if defined(USE_LOCALE_CTYPE) \ # if defined(USE_LOCALE_CTYPE) \
&& (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET))) && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))


{ /* Next try nl_langinfo or MB_CUR_MAX if available */ { /* Next try nl_langinfo or MB_CUR_MAX if available */


char *save_ctype_locale = NULL; char *save_ctype_locale = NULL;
bool is_utf8;


if (category != LC_CTYPE) { /* These work only on LC_CTYPE */ if (category != LC_CTYPE) { /* These work only on LC_CTYPE */


Expand Down Expand Up @@ -3105,8 +3215,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
DEBUG_L(PerlIO_printf(Perl_debug_log, DEBUG_L(PerlIO_printf(Perl_debug_log,
"\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n", "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
codeset, is_utf8)); codeset, is_utf8));
Safefree(save_input_locale); goto finish_and_return;
return is_utf8;
} }
} }


Expand Down Expand Up @@ -3163,7 +3272,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
Safefree(save_ctype_locale); Safefree(save_ctype_locale);
} }


return is_utf8; goto finish_and_return;


# endif # endif


Expand All @@ -3188,7 +3297,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
{ {
char *save_monetary_locale = NULL; char *save_monetary_locale = NULL;
bool only_ascii = FALSE; bool only_ascii = FALSE;
bool is_utf8 = FALSE;
struct lconv* lc; struct lconv* lc;


/* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of /* Like above for LC_CTYPE, we first set LC_MONETARY to the locale of
Expand Down Expand Up @@ -3245,8 +3353,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
* is non-ascii UTF-8. */ * is non-ascii UTF-8. */
DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n", DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
save_input_locale, is_utf8)); save_input_locale, is_utf8));
Safefree(save_input_locale); goto finish_and_return;
return is_utf8;
} }
} }
cant_use_monetary: cant_use_monetary:
Expand Down Expand Up @@ -3331,8 +3438,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n", DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
save_input_locale, save_input_locale,
is_utf8_string((U8 *) formatted_time, 0))); is_utf8_string((U8 *) formatted_time, 0)));
Safefree(save_input_locale); is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
return is_utf8_string((U8 *) formatted_time, 0); goto finish_and_return;
} }


/* Falling off the end of the loop indicates all the names were just /* Falling off the end of the loop indicates all the names were just
Expand Down Expand Up @@ -3363,7 +3470,6 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
* are much more likely to have been translated. */ * are much more likely to have been translated. */
{ {
int e; int e;
bool is_utf8 = FALSE;
bool non_ascii = FALSE; bool non_ascii = FALSE;
char *save_messages_locale = NULL; char *save_messages_locale = NULL;
const char * errmsg = NULL; const char * errmsg = NULL;
Expand Down Expand Up @@ -3427,8 +3533,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n", DEBUG_L(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
save_input_locale, save_input_locale,
is_utf8)); is_utf8));
Safefree(save_input_locale); goto finish_and_return;
return is_utf8;
} }


DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
Expand Down Expand Up @@ -3472,8 +3577,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
DEBUG_L(PerlIO_printf(Perl_debug_log, DEBUG_L(PerlIO_printf(Perl_debug_log,
"Locale %s ends with UTF-8 in name\n", "Locale %s ends with UTF-8 in name\n",
save_input_locale)); save_input_locale));
Safefree(save_input_locale); is_utf8 = TRUE;
return TRUE; goto finish_and_return;
} }
} }
DEBUG_L(PerlIO_printf(Perl_debug_log, DEBUG_L(PerlIO_printf(Perl_debug_log,
Expand All @@ -3489,8 +3594,8 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
DEBUG_L(PerlIO_printf(Perl_debug_log, DEBUG_L(PerlIO_printf(Perl_debug_log,
"Locale %s ends with 65001 in name, is UTF-8 locale\n", "Locale %s ends with 65001 in name, is UTF-8 locale\n",
save_input_locale)); save_input_locale));
Safefree(save_input_locale); is_utf8 = TRUE;
return TRUE; goto finish_and_return;
} }


# endif # endif
Expand All @@ -3504,16 +3609,73 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category)
DEBUG_L(PerlIO_printf(Perl_debug_log, DEBUG_L(PerlIO_printf(Perl_debug_log,
"Locale %s has 8859 in name, not UTF-8 locale\n", "Locale %s has 8859 in name, not UTF-8 locale\n",
save_input_locale)); save_input_locale));
Safefree(save_input_locale); is_utf8 = FALSE;
return FALSE; goto finish_and_return;
} }
# endif # endif


DEBUG_L(PerlIO_printf(Perl_debug_log, DEBUG_L(PerlIO_printf(Perl_debug_log,
"Assuming locale %s is not a UTF-8 locale\n", "Assuming locale %s is not a UTF-8 locale\n",
save_input_locale)); save_input_locale));
is_utf8 = FALSE;

finish_and_return:

/* Cache this result so we don't have to go through all this next time. */
utf8ness_cache_size = sizeof(PL_locale_utf8ness)
- (utf8ness_cache - PL_locale_utf8ness);

/* But we can't save it if it is too large for the total space available */
if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
Size_t utf8ness_cache_len = strlen(utf8ness_cache);

/* Here it can fit, but we may need to clear out the oldest cached
* result(s) to do so. Check */
if (utf8ness_cache_len + input_name_len_with_overhead
>= utf8ness_cache_size)
{
/* Here we have to clear something out to make room for this.
* Start looking at the rightmost place where it could fit and find
* the beginning of the entry that extends past that. */
char * cutoff = (char *) my_memrchr(utf8ness_cache,
UTF8NESS_SEP[0],
utf8ness_cache_size
- input_name_len_with_overhead);

assert(cutoff);
assert(cutoff >= utf8ness_cache);

/* This and all subsequent entries must be removed */
*cutoff = '\0';
utf8ness_cache_len = strlen(utf8ness_cache);
}

/* Make space for the new entry */
Move(utf8ness_cache,
utf8ness_cache + input_name_len_with_overhead,
utf8ness_cache_len, char);

/* And insert it */
Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';

assert((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1]
& (PERL_UINTMAX_T) ~1) == '0');
}

# ifdef DEBUGGING

if (DEBUG_Lv_TEST || debug_initialization) {
PerlIO_printf(Perl_debug_log,
"PL_locale_utf8ness is now %s; returning %d\n",
PL_locale_utf8ness, is_utf8);
}

# endif

Safefree(delimited);
Safefree(save_input_locale); Safefree(save_input_locale);
return FALSE; return is_utf8;
} }


#endif #endif
Expand Down

0 comments on commit f705e65

Please sign in to comment.