From 783c770ae9086c4ad4861209904a3a6b2e8ee964 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 11 Mar 2021 18:15:06 -0700 Subject: [PATCH] locale.c: querylocale() doesn't work on LC_ALL I had misread the man pages. This bug has been in the field for several releases now, but most likely hasn't shown up because it's almost always the case that the locale categories will be set to the same locale. And so most implementations of querylocale() would return the correct result. This commit works by splitting the calculation of the value of LC_ALL from S_emulate_setlocale_i() into a separate function, and extending it to work on querylocale() systems. This has the added benefit of removing tangential code from the main line, making S_emulate_setlocale_i easier to read. calculate_LC_ALL() is the new function, and is now called from two places. As part of this commit, constness is added to PL_curlocales[] Part of this change is to keep our records of LC_ALL on non-querylocale systems always up-to-date, which is better practice And part of this change is temporary, marked as such, to be removed a few commits later. --- embed.fnc | 5 ++ embed.h | 10 +++ intrpvar.h | 2 +- locale.c | 254 +++++++++++++++++++++++++++++++++-------------------- proto.h | 13 +++ 5 files changed, 186 insertions(+), 98 deletions(-) diff --git a/embed.fnc b/embed.fnc index 56d1c8a18765..ff70cd4b7c21 100644 --- a/embed.fnc +++ b/embed.fnc @@ -3230,6 +3230,11 @@ iTR |const char *|save_to_buffer|NULLOK const char * string \ |const Size_t offset # if defined(USE_LOCALE) S |char* |stdize_locale |NN char* locs +# ifdef USE_QUERYLOCALE +S |const char *|calculate_LC_ALL|const locale_t cur_obj +# else +S |const char *|calculate_LC_ALL|NN const char ** individ_locales +# endif S |void |new_collate |NULLOK const char* newcoll S |void |new_ctype |NN const char* newctype Sr |void |setlocale_failure_panic_i|const unsigned int cat_index \ diff --git a/embed.h b/embed.h index 9ce7047db7c2..affa1c0a6e8d 100644 --- a/embed.h +++ b/embed.h @@ -1504,6 +1504,13 @@ # if !(defined(PERL_USE_3ARG_SIGHANDLER)) #define sighandler Perl_sighandler # endif +# if !(defined(USE_QUERYLOCALE)) +# if defined(PERL_IN_LOCALE_C) +# if defined(USE_LOCALE) +#define calculate_LC_ALL(a) S_calculate_LC_ALL(aTHX_ a) +# endif +# endif +# endif # if !(defined(_MSC_VER)) #define magic_regdatum_set(a,b) Perl_magic_regdatum_set(aTHX_ a,b) # endif @@ -1706,6 +1713,9 @@ #define emulate_setlocale_i(a,b) S_emulate_setlocale_i(aTHX_ a,b) #define my_querylocale_i(a) S_my_querylocale_i(aTHX_ a) # endif +# if defined(USE_QUERYLOCALE) +#define calculate_LC_ALL(a) S_calculate_LC_ALL(aTHX_ a) +# endif # if defined(WIN32) #define win32_setlocale(a,b) S_win32_setlocale(aTHX_ a,b) # endif diff --git a/intrpvar.h b/intrpvar.h index 88162843cab6..390dd95a9883 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -724,7 +724,7 @@ PERLVAR(I, padix_floor, PADOFFSET) /* how low may inner block reset padix */ /* This is the most number of categories we've encountered so far on any * platform */ -PERLVARA(I, curlocales, 12, char *) +PERLVARA(I, curlocales, 12, const char *) #endif #ifdef USE_LOCALE_COLLATE diff --git a/locale.c b/locale.c index 2b81fd832c7c..7c3c7cf803d8 100644 --- a/locale.c +++ b/locale.c @@ -501,12 +501,20 @@ S_category_name(const int category) # ifndef USE_QUERYLOCALE # define USE_PL_CURLOCALES # else +# define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask) + + /* This code used to think querylocale() was valid on LC_ALL. Make sure + * all instances of that have been removed */ +# define QUERYLOCALE_ASSERT(index) \ + __ASSERT_(isSINGLE_BIT_SET(category_masks[index])) # if ! defined(HAS_QUERYLOCALE) && defined(_NL_LOCALE_NAME) # define querylocale_l(index, locale_obj) \ - nl_langinfo_l(_NL_LOCALE_NAME(categories[index]), locale_obj) + (QUERYLOCALE_ASSERT(index) \ + nl_langinfo_l(_NL_LOCALE_NAME(categories[index]), locale_obj)) # else # define querylocale_l(index, locale_obj) \ - querylocale(category_masks[index], locale_obj) + (QUERYLOCALE_ASSERT(index) \ + querylocale(category_masks[index], locale_obj)) # endif # endif # if ! defined(__GLIBC__) || ! defined(USE_LOCALE_MESSAGES) @@ -599,6 +607,7 @@ S_my_querylocale_i(pTHX_ const unsigned int index) int category; const locale_t cur_obj = uselocale((locale_t) 0); + const char * retval; PERL_ARGS_ASSERT_MY_QUERYLOCALE_I; assert(index <= NOMINAL_LC_ALL_INDEX); @@ -608,97 +617,31 @@ S_my_querylocale_i(pTHX_ const unsigned int index) DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: my_querylocale_i %p\n", __FILE__, __LINE__, cur_obj)); if (cur_obj == LC_GLOBAL_LOCALE) { - return porcelain_setlocale(category, NULL); + retval = porcelain_setlocale(category, NULL); } + else { # ifdef USE_QUERYLOCALE - return (char *) querylocale_l(index, cur_obj); + /* We don't currently keep records when there is querylocale(), so have + * to get it anew each time */ + retval = (index == LC_ALL_INDEX_) + ? calculate_LC_ALL(cur_obj) + : querylocale_l(index, cur_obj); # else - /* Without querylocale(), we have to use our record-keeping we've done. */ - if (category != LC_ALL) { - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s:%d: my_querylocale_i returning %s\n", - __FILE__, __LINE__, PL_curlocales[index])); - - return PL_curlocales[index]; - } - else { /* For LC_ALL */ - unsigned int i; - Size_t names_len = 0; - char * all_string; - bool are_all_categories_the_same_locale = TRUE; - - /* If we have a valid LC_ALL value, just return it */ - if (PL_curlocales[LC_ALL_INDEX_]) { - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s:%d: my_querylocale_i returning %s\n", - __FILE__, __LINE__, PL_curlocales[LC_ALL_INDEX_])); - - return PL_curlocales[LC_ALL_INDEX_]; - } - - /* Otherwise, we need to construct a string of name=value pairs. - * We use the glibc syntax, like - * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;... - * First calculate the needed size. Along the way, check if all - * the locale names are the same */ - for (i = 0; i < LC_ALL_INDEX_; i++) { - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s:%d: my_querylocale_i i=%d, name=%s, locale=%s\n", - __FILE__, __LINE__, i, category_names[i], - PL_curlocales[i])); - - names_len += strlen(category_names[i]) - + 1 /* '=' */ - + strlen(PL_curlocales[i]) - + 1; /* ';' */ + /* But we do have up-to-date values when we keep our own records */ + retval = PL_curlocales[index]; - if (i > 0 && strNE(PL_curlocales[i], PL_curlocales[i-1])) { - are_all_categories_the_same_locale = FALSE; - } - } +# endif - /* If they are the same, we don't actually have to construct the - * string; we just make the entry in LC_ALL_INDEX_ valid, and be that - * single name */ - if (are_all_categories_the_same_locale) { - PL_curlocales[LC_ALL_INDEX_] = savepv(PL_curlocales[0]); - return PL_curlocales[LC_ALL_INDEX_]; } - names_len++; /* Trailing '\0' */ - SAVEFREEPV(Newx(all_string, names_len, char)); - *all_string = '\0'; - - /* Then fill in the string */ - for (i = 0; i < LC_ALL_INDEX_; i++) { - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s:%d: my_querylocale_i i=%d, name=%s, locale=%s\n", - __FILE__, __LINE__, i, category_names[i], - PL_curlocales[i])); - - my_strlcat(all_string, category_names[i], names_len); - my_strlcat(all_string, "=", names_len); - my_strlcat(all_string, PL_curlocales[i], names_len); - my_strlcat(all_string, ";", names_len); - } - - DEBUG_L(PerlIO_printf(Perl_debug_log, - "%s:%d: my_querylocale_i returning %s\n", - __FILE__, __LINE__, all_string)); - - return all_string; - } - -# endif - + "%s:%d: my_querylocale_i(%s) returning '%s'\n", + __FILE__, __LINE__, category_names[index], retval)); + return retval; } STATIC const char * @@ -1074,7 +1017,7 @@ S_emulate_setlocale_i(pTHX_ const unsigned int index, const char * locale) # ifdef USE_QUERYLOCALE if (strEQ(locale, "")) { - locale = querylocale_l(index, new_obj); + locale = querylocale_i(index); } # else @@ -1098,20 +1041,19 @@ S_emulate_setlocale_i(pTHX_ const unsigned int index, const char * locale) } else { - /* For a single category, if it's not the same as the one in LC_ALL, we - * nullify LC_ALL */ + /* Otherwise, update the single category, plus LC_ALL */ + + Safefree(PL_curlocales[index]); + PL_curlocales[index] = savepv(locale); - if (PL_curlocales[LC_ALL_INDEX_] && strNE(PL_curlocales[LC_ALL_INDEX_], - locale)) + if ( PL_curlocales[LC_ALL_INDEX_] == NULL + || strNE(PL_curlocales[LC_ALL_INDEX_], locale)) { Safefree(PL_curlocales[LC_ALL_INDEX_]); - PL_curlocales[LC_ALL_INDEX_] = NULL; + PL_curlocales[LC_ALL_INDEX_] = + savepv(calculate_LC_ALL(PL_curlocales)); } - /* Then update the category's record */ - Safefree(PL_curlocales[index]); - PL_curlocales[index] = savepv(locale); - FIX_GLIBC_LC_MESSAGES_BUG(index); } @@ -1167,6 +1109,130 @@ S_stdize_locale(pTHX_ char *locs) return locs; } +STATIC +const char * + +# ifdef USE_QUERYLOCALE +S_calculate_LC_ALL(pTHX_ const locale_t cur_obj) +# else +S_calculate_LC_ALL(pTHX_ const char ** individ_locales) +# endif + +{ + /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed. + * querylocale(), on systems that have it, doesn't tend to work for LC_ALL. + * So we have to construct the answer ourselves based on the passed in + * data, which is either a locale_t object, for systems with querylocale(), + * or an array we keep updated to the proper values, otherwise. + * + * This returns a mortalized string containing the locale name(s) of + * LC_ALL. + * + * If all individual categories are the same locale, we can just set LC_ALL + * to that locale. But if not, we have to create an aggregation of all the + * categories on the system. Platforms differ as to the syntax they use + * for these non-uniform locales for LC_ALL. Some use a '/' or other + * delimiter of the locales with a predetermined order of categories; a + * Configure probe would be needed to tell us how to decipher those. glibc + * uses a series of name=value pairs, like + * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;... + * The syntax we use for our aggregation doesn't much matter, as we take + * care not to use the native setlocale() function on whatever style is + * chosen. But, it would be possible for someone to call Perl_setlocale() + * using a native style we don't understand. So far no one has complained. + * + * For systems that have categories we don't know about, the algorithm + * below won't know about those missing categories, leading to potential + * bugs for code that looks at them. If there is an environment variable + * that sets that category, we won't know to look for it, and so our use of + * LANG or "C" improperly overrides it. On the other hand, if we don't do + * what is done here, and there is no environment variable, the category's + * locale should be set to LANG or "C". So there is no good solution. khw + * thinks the best is to make sure we have a complete list of possible + * categories, adding new ones as they show up on obscure platforms. + */ + + unsigned int i; + Size_t names_len = 0; + bool are_all_categories_the_same_locale = TRUE; + char * aggregate_locale; + char * previous_start = NULL; + char * this_start; + Size_t entry_len = 0; + + PERL_ARGS_ASSERT_CALCULATE_LC_ALL; + + /* First calculate the needed size for the string listing the categories + * and their locales. */ + for (i = 0; i < LC_ALL_INDEX_; i++) { + +# ifdef USE_QUERYLOCALE + const char * entry = querylocale_l(i, cur_obj); +# else + const char * entry = individ_locales[i]; +# endif + + if (entry == NULL) continue; /* XXX Temporary */ + names_len += strlen(category_names[i]) + + 1 /* '=' */ + + strlen(entry) + + 1; /* ';' */ + } + + names_len++; /* Trailing '\0' */ + + /* Allocate enough space for the aggregated string */ + SAVEFREEPV(Newxz(aggregate_locale, names_len, char)); + + /* Then fill it in */ + for (i = 0; i < LC_ALL_INDEX_; i++) { + Size_t new_len; + +# ifdef USE_QUERYLOCALE + const char * entry = querylocale_l(i, cur_obj); +# else + const char * entry = individ_locales[i]; +# endif + + if (entry == NULL) continue; /* XXX Temporary */ + new_len = my_strlcat(aggregate_locale, category_names[i], names_len); + assert(new_len <= names_len); + new_len = my_strlcat(aggregate_locale, "=", names_len); + assert(new_len <= names_len); + + this_start = aggregate_locale + strlen(aggregate_locale); + entry_len = strlen(entry); + + new_len = my_strlcat(aggregate_locale, entry, names_len); + assert(new_len <= names_len); + new_len = my_strlcat(aggregate_locale, ";", names_len); + assert(new_len <= names_len); + PERL_UNUSED_VAR(new_len); /* Only used in DEBUGGING */ + + if ( i > 0 + && are_all_categories_the_same_locale + && memNE(previous_start, this_start, entry_len + 1)) + { + are_all_categories_the_same_locale = FALSE; + } + else { + previous_start = this_start; + } + } + + /* If they are all the same, just return any one of them */ + if (are_all_categories_the_same_locale) { + aggregate_locale = this_start; + aggregate_locale[entry_len] = '\0'; + } + + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "%s:%d: calculate_LC_ALL returning '%s'\n", + __FILE__, __LINE__, aggregate_locale)); + + return aggregate_locale; +} + STATIC void S_setlocale_failure_panic_i(pTHX_ const unsigned int cat_index, @@ -3372,12 +3438,6 @@ Perl_init_i18nl10n(pTHX_ int printwarn) PL_numeric_radix_sv = newSVpvs("."); -# endif -# ifdef USE_PL_CURLOCALES - - /* Initialize our records. If we have POSIX 2008, we have LC_ALL */ - void_setlocale_c(LC_ALL, porcelain_setlocale(LC_ALL, NULL)); - # endif # ifdef LOCALE_ENVIRON_REQUIRED diff --git a/proto.h b/proto.h index a4dbb44302b8..b500abe315c8 100644 --- a/proto.h +++ b/proto.h @@ -4232,6 +4232,15 @@ PERL_CALLCONV Signal_t Perl_csighandler(int sig); PERL_CALLCONV Signal_t Perl_sighandler(int sig); #define PERL_ARGS_ASSERT_SIGHANDLER #endif +#if !(defined(USE_QUERYLOCALE)) +# if defined(PERL_IN_LOCALE_C) +# if defined(USE_LOCALE) +STATIC const char * S_calculate_LC_ALL(pTHX_ const char ** individ_locales); +#define PERL_ARGS_ASSERT_CALCULATE_LC_ALL \ + assert(individ_locales) +# endif +# endif +#endif #if !(defined(_MSC_VER)) PERL_CALLCONV_NO_RET int Perl_magic_regdatum_set(pTHX_ SV* sv, MAGIC* mg) __attribute__noreturn__; @@ -5152,6 +5161,10 @@ STATIC const char* S_emulate_setlocale_i(pTHX_ const unsigned int index, const c STATIC const char* S_my_querylocale_i(pTHX_ const unsigned int index); #define PERL_ARGS_ASSERT_MY_QUERYLOCALE_I # endif +# if defined(USE_QUERYLOCALE) +STATIC const char * S_calculate_LC_ALL(pTHX_ const locale_t cur_obj); +#define PERL_ARGS_ASSERT_CALCULATE_LC_ALL +# endif # if defined(WIN32) STATIC char* S_win32_setlocale(pTHX_ int category, const char* locale); #define PERL_ARGS_ASSERT_WIN32_SETLOCALE