diff --git a/embed.fnc b/embed.fnc index 594b679661b7..67256c801aa9 100644 --- a/embed.fnc +++ b/embed.fnc @@ -3226,10 +3226,7 @@ SG |bool |sv_derived_from_svpvn |NULLOK SV *sv \ #if defined(PERL_IN_LOCALE_C) # ifdef USE_LOCALE -ST |const char*|category_name |const int category ST |unsigned int|get_category_index|const int category|NULLOK const char * locale -S |const char*|switch_category_locale_to_template|const int switch_category|const int template_category|NULLOK const char * template_locale -S |void |restore_switched_locale|const int category|NULLOK const char * const original_locale S |unsigned|get_locale_string_utf8ness_i \ |NULLOK const char * locale \ |const unsigned cat_index \ @@ -3332,15 +3329,6 @@ S |void |print_bytes_for_locale |NN const char * const s \ # endif #endif -#if defined(USE_LOCALE) \ - && ( defined(PERL_IN_LOCALE_C) \ - || defined(PERL_IN_MG_C) \ - || defined (PERL_EXT_POSIX) \ - || defined (PERL_EXT_LANGINFO)) -Cp |bool |_is_cur_LC_category_utf8|int category -#endif - - #if defined(PERL_IN_UTIL_C) S |SV* |mess_alloc S |SV * |with_queued_errors|NN SV *ex diff --git a/embed.h b/embed.h index 39004ad82935..0cd4e43b3a67 100644 --- a/embed.h +++ b/embed.h @@ -902,9 +902,6 @@ #define sv_dup(a,b) Perl_sv_dup(aTHX_ a,b) #define sv_dup_inc(a,b) Perl_sv_dup_inc(aTHX_ a,b) #endif -#if defined(USE_LOCALE) && ( defined(PERL_IN_LOCALE_C) || defined(PERL_IN_MG_C) || defined (PERL_EXT_POSIX) || defined (PERL_EXT_LANGINFO)) -#define _is_cur_LC_category_utf8(a) Perl__is_cur_LC_category_utf8(aTHX_ a) -#endif #if defined(USE_LOCALE_COLLATE) #define sv_collxfrm_flags(a,b,c) Perl_sv_collxfrm_flags(aTHX_ a,b,c) #endif @@ -1726,7 +1723,6 @@ # endif # if defined(PERL_IN_LOCALE_C) # if defined(USE_LOCALE) -#define category_name S_category_name #define get_category_index S_get_category_index #define get_locale_string_utf8ness_i(a,b,c,d) S_get_locale_string_utf8ness_i(aTHX_ a,b,c,d) #define is_codeset_name_UTF8 S_is_codeset_name_UTF8 @@ -1735,13 +1731,11 @@ #define new_collate(a) S_new_collate(aTHX_ a) #define new_ctype(a) S_new_ctype(aTHX_ a) #define new_numeric(a) S_new_numeric(aTHX_ a) -#define restore_switched_locale(a,b) S_restore_switched_locale(aTHX_ a,b) #define restore_toggled_locale_i(a,b) S_restore_toggled_locale_i(aTHX_ a,b) #define save_to_buffer S_save_to_buffer #define set_numeric_radix(a) S_set_numeric_radix(aTHX_ a) #define setlocale_failure_panic_i(a,b,c,d,e) S_setlocale_failure_panic_i(aTHX_ a,b,c,d,e) #define stdize_locale(a,b,c,d) S_stdize_locale(aTHX_ a,b,c,d) -#define switch_category_locale_to_template(a,b,c) S_switch_category_locale_to_template(aTHX_ a,b,c) #define toggle_locale_i(a,b) S_toggle_locale_i(aTHX_ a,b) # if defined(USE_POSIX_2008_LOCALE) #define emulate_setlocale_i(a,b,c,d) S_emulate_setlocale_i(aTHX_ a,b,c,d) diff --git a/embedvar.h b/embedvar.h index 004e5556220c..d1829b0e09e7 100644 --- a/embedvar.h +++ b/embedvar.h @@ -189,7 +189,6 @@ #define PL_laststatval (vTHX->Ilaststatval) #define PL_laststype (vTHX->Ilaststype) #define PL_lc_numeric_mutex_depth (vTHX->Ilc_numeric_mutex_depth) -#define PL_locale_utf8ness (vTHX->Ilocale_utf8ness) #define PL_localizing (vTHX->Ilocalizing) #define PL_localpatches (vTHX->Ilocalpatches) #define PL_lockhook (vTHX->Ilockhook) diff --git a/intrpvar.h b/intrpvar.h index 92718e2af100..ed08fb3972b7 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -370,7 +370,6 @@ PERLVAR(I, in_utf8_turkic_locale, bool) #if defined(USE_ITHREADS) && ! defined(USE_THREAD_SAFE_LOCALE) PERLVARI(I, lc_numeric_mutex_depth, int, 0) /* Emulate general semaphore */ #endif -PERLVARA(I, locale_utf8ness, 256, char) #ifdef USE_LOCALE_CTYPE PERLVAR(I, warn_locale, SV *) diff --git a/locale.c b/locale.c index 9f092d067d4d..f9e90d2b94fb 100644 --- a/locale.c +++ b/locale.c @@ -162,29 +162,6 @@ static const char C_thousands_sep[] = ""; #ifdef USE_LOCALE -/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far - * looked up. This is in the form of a C string: */ - -# define UTF8NESS_SEP "\v" -# define UTF8NESS_PREFIX "\f" - -/* So, the string looks like: - * - * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0 - * - * where the digit 0 after the \a indicates that the locale starting just - * after the preceding \v is not UTF-8, and the digit 1 mean it is. */ - -STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1); -STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1); - -# define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \ - UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0" - -/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are - * kept there always. The remining portion of the cache is LRU, with the - * oldest looked-up locale at the tail end */ - # ifdef DEBUGGING # define setlocale_debug_string_c(category, locale, result) \ setlocale_debug_string_i(category##_INDEX_, locale, result) @@ -427,20 +404,6 @@ S_get_category_index(const int category, const char * locale) return NOMINAL_LC_ALL_INDEX + 1; } -STATIC const char * -S_category_name(const int category) -{ - unsigned int index; - - index = get_category_index(category, NULL); - - if (index <= NOMINAL_LC_ALL_INDEX) { - return category_names[index]; - } - - return Perl_form_nocontext("%d (unknown)", category); -} - #endif /* ifdef USE_LOCALE */ #ifdef USE_POSIX_2008_LOCALE @@ -4690,12 +4653,6 @@ Perl_init_i18nl10n(pTHX_ int printwarn) # ifdef HAS_WCTOMBR wcrtomb(NULL, L'\0', &PL_wcrtomb_ps); # endif - - /* Initialize the cache of the program's UTF-8ness for the always known - * locales C and POSIX */ - my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness, - sizeof(PL_locale_utf8ness)); - # ifdef USE_THREAD_SAFE_LOCALE # ifdef WIN32 @@ -5075,24 +5032,6 @@ Perl_init_i18nl10n(pTHX_ int printwarn) new_LC_ALL(NULL); for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { - -# if defined(USE_LOCALE_THREADS) && ! defined(USE_THREAD_SAFE_LOCALE) - - /* This caches whether each category's locale is UTF-8 or not. This - * may involve changing the locale. It is ok to do this at - * initialization time before any threads have started, but not later - * unless thread-safe operations are used. - * Caching means that if the program heeds our dictate not to change - * locales in threaded applications, this data will remain valid, and - * it may get queried without having to change locales. If the - * environment is such that all categories have the same locale, this - * isn't needed, as the code will not change the locale; but this - * handles the uncommon case where the environment has disparate - * locales for the categories */ - (void) _is_cur_LC_category_utf8(categories[i]); - -# endif - Safefree(curlocales[i]); } @@ -5781,682 +5720,6 @@ S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index, Safefree(restore_locale); } -STATIC const char * -S_switch_category_locale_to_template(pTHX_ const int switch_category, - const int template_category, - const char * template_locale) -{ - /* Changes the locale for LC_'switch_category" to that of - * LC_'template_category', if they aren't already the same. If not NULL, - * 'template_locale' is the locale that 'template_category' is in. - * - * Returns a copy of the name of the original locale for 'switch_category' - * so can be switched back to with the companion function - * restore_switched_locale(), (NULL if no restoral is necessary.) */ - - const char * restore_to_locale = NULL; - - if (switch_category == template_category) { /* No changes needed */ - return NULL; - } - - /* Find the original locale of the category we may need to change, so that - * it can be restored to later */ - restore_to_locale = querylocale_r(switch_category); - if (! restore_to_locale) { - Perl_croak(aTHX_ - "panic: %s: %d: Could not find current %s locale, errno=%d\n", - __FILE__, __LINE__, category_name(switch_category), errno); - } - restore_to_locale = savepv(restore_to_locale); - - /* If the locale of the template category wasn't passed in, find it now */ - if (template_locale == NULL) { - template_locale = querylocale_r(template_category); - if (! template_locale) { - Perl_croak(aTHX_ - "panic: %s: %d: Could not find current %s locale, errno=%d\n", - __FILE__, __LINE__, category_name(template_category), errno); - } - } - - /* It the locales are the same, there's nothing to do */ - if (strEQ(restore_to_locale, template_locale)) { - Safefree(restore_to_locale); - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n", - category_name(switch_category), template_locale)); - - return NULL; - } - - /* Finally, change the locale to the template one */ - if (! bool_setlocale_r(switch_category, template_locale)) { - Perl_croak(aTHX_ - "panic: %s: %d: Could not change %s locale to %s, errno=%d\n", - __FILE__, __LINE__, category_name(switch_category), - template_locale, errno); - } - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n", - category_name(switch_category), template_locale)); - - return restore_to_locale; -} - -STATIC void -S_restore_switched_locale(pTHX_ const int category, - const char * const original_locale) -{ - /* Restores the locale for LC_'category' to 'original_locale' (which is a - * copy that will be freed by this function), or do nothing if the latter - * parameter is NULL */ - - if (original_locale == NULL) { - return; - } - - if (! bool_setlocale_r(category, original_locale)) { - Perl_croak(aTHX_ - "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n", - __FILE__, __LINE__, - category_name(category), original_locale, errno); - } - - Safefree(original_locale); -} - -/* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */ -# define CUR_LC_BUFFER_SIZE 64 - -bool -Perl__is_cur_LC_category_utf8(pTHX_ int category) -{ - /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE - * otherwise. 'category' may not be LC_ALL. If the platform doesn't have - * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence - * could give the wrong result. The result will very likely be correct for - * languages that have commonly used non-ASCII characters, but for notably - * English, it comes down to if the locale's name ends in something like - * "UTF-8". It errs on the side of not being a UTF-8 locale. - * - * If the platform is early C89, not containing mbtowc(), or we are - * compiled to not pay attention to LC_CTYPE, this employs heuristics. - * These work very well for non-Latin locales or those whose currency - * symbol isn't a '$' nor plain ASCII text. But without LC_CTYPE and at - * least MB_CUR_MAX, English locales with an ASCII currency symbol depend - * on the name containing UTF-8 or not. */ - - /* Name of current locale corresponding to the input category */ - const char *save_input_locale = NULL; - - bool is_utf8 = FALSE; /* The return value */ - - /* The variables below are for the cache of previous lookups using this - * function. The cache is a C string, described at the definition for - * 'C_and_POSIX_utf8ness'. - * - * The first part of the cache is fixed, for the C and POSIX locales. The - * varying part starts just after them. */ - char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness); - - Size_t utf8ness_cache_size; /* Size of the varying portion */ - Size_t input_name_len; /* Length in bytes of save_input_locale */ - Size_t input_name_len_with_overhead; /* plus extra chars used to store - the name in the cache */ - char * delimited; /* The name plus the delimiters used to store - it in the cache */ - char buffer[CUR_LC_BUFFER_SIZE]; /* small buffer */ - char * name_pos; /* position of 'delimited' in the cache, or 0 - if not there */ - - -# ifdef LC_ALL - - assert(category != LC_ALL); - -# endif - - /* Get the desired category's locale */ - save_input_locale = querylocale_r(category); - if (! save_input_locale) { - Perl_croak(aTHX_ - "panic: %s: %d: Could not find current %s locale, errno=%d\n", - __FILE__, __LINE__, category_name(category), errno); - } - save_input_locale = savepv(save_input_locale); - - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Current locale for %s is %s\n", - category_name(category), save_input_locale)); - - input_name_len = strlen(save_input_locale); - - /* In our cache, each name is accompanied by two delimiters and a single - * utf8ness digit */ - input_name_len_with_overhead = input_name_len + 3; - - if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) { - /* we can use the buffer, avoid a malloc */ - delimited = buffer; - } else { /* need a malloc */ - /* Allocate and populate space for a copy of the name surrounded by the - * delimiters */ - Newx(delimited, input_name_len_with_overhead, char); - } - - delimited[0] = UTF8NESS_SEP[0]; - Copy(save_input_locale, delimited + 1, input_name_len, char); - delimited[input_name_len+1] = UTF8NESS_PREFIX[0]; - delimited[input_name_len+2] = '\0'; - - /* And see if that is in the cache */ - name_pos = instr(PL_locale_utf8ness, delimited); - if (name_pos) { - is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0'; - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "UTF8ness for locale %s=%d, \n", - save_input_locale, is_utf8)); - - /* And, if not already in that position, move it to the beginning of - * the non-constant portion of the list, since it is the most recently - * used. (We don't have to worry about overflow, since just moving - * existing names around) */ - if (name_pos > utf8ness_cache) { - Move(utf8ness_cache, - utf8ness_cache + input_name_len_with_overhead, - name_pos - utf8ness_cache, char); - Copy(delimited, - utf8ness_cache, - input_name_len_with_overhead - 1, char); - utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; - } - - /* free only when not using the buffer */ - if ( delimited != buffer ) Safefree(delimited); - Safefree(save_input_locale); - return is_utf8; - } - - /* Here we don't have stored the utf8ness for the input locale. We have to - * calculate it */ - -# if defined(USE_LOCALE_CTYPE) \ - && ( defined(HAS_SOME_LANGINFO) \ - || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC))) - - { - const char *original_ctype_locale - = switch_category_locale_to_template(LC_CTYPE, - category, - save_input_locale); - - /* Here the current LC_CTYPE is set to the locale of the category whose - * information is desired. This means that nl_langinfo() and mbtowc() - * should give the correct results */ - -# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding - calling the functions if we have this */ - - /* Standard UTF-8 needs at least 4 bytes to represent the maximum - * Unicode code point. */ - - DEBUG_L(PerlIO_printf(Perl_debug_log, "%s: %d: MB_CUR_MAX=%d\n", - __FILE__, __LINE__, (int) MB_CUR_MAX)); - if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) { - is_utf8 = FALSE; - restore_switched_locale(LC_CTYPE, original_ctype_locale); - goto finish_and_return; - } - -# endif -# if defined(HAS_SOME_LANGINFO) - - { /* The task is easiest if the platform has this POSIX 2001 function. - Except on some platforms it can wrongly return "", so have to have - a fallback. And it can return that it's UTF-8, even if there are - variances from that. For example, Turkish locales may use the - alternate dotted I rules, and sometimes it appears to be a - defective locale definition. XXX We should probably check for - these in the Latin1 range and warn (but on glibc, requires - iswalnum() etc. due to their not handling 80-FF correctly */ - const char * scratch_buffer = NULL; - const char *codeset = my_langinfo_c(CODESET, LC_CTYPE, NULL, - &scratch_buffer, NULL, NULL); - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "\tnllanginfo returned CODESET '%s'\n", codeset)); - - if (codeset && strNE(codeset, "")) { - - /* If the implementation of foldEQ() somehow were - * to change to not go byte-by-byte, this could - * read past end of string, as only one length is - * checked. But currently, a premature NUL will - * compare false, and it will stop there */ - is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8")) - || foldEQ(codeset, STR_WITH_LEN("UTF8"))); - - DEBUG_L(PerlIO_printf(Perl_debug_log, - "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n", - codeset, is_utf8)); - restore_switched_locale(LC_CTYPE, original_ctype_locale); - Safefree(scratch_buffer); - goto finish_and_return; - } - } - -# endif -# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) - /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a - * late adder to C89, so very likely to have it. However, testing has - * shown that, like nl_langinfo() above, there are locales that are not - * strictly UTF-8 that this will return that they are */ - { - wchar_t wc; - int len; - - PERL_UNUSED_RESULT(mbtowc_(NULL, NULL, 0)); - len = mbtowc_(&wc, REPLACEMENT_CHARACTER_UTF8, - STRLENs(REPLACEMENT_CHARACTER_UTF8)); - - is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8) - && wc == (wchar_t) UNICODE_REPLACEMENT); - } - -# endif - - restore_switched_locale(LC_CTYPE, original_ctype_locale); - goto finish_and_return; - } - -# else - - /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next - * try looking at the currency symbol to see if it disambiguates - * things. Often that will be in the native script, and if the symbol - * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII - * UTF-8, we infer that the locale is too, as the odds of a non-UTF8 - * string being valid UTF-8 are quite small */ - -# ifdef USE_LOCALE_MONETARY - - /* If have LC_MONETARY, we can look at the currency symbol. Often that - * will be in the native script. We do this one first because there is - * just one string to examine, so potentially avoids work */ - - { - const char *original_monetary_locale - = switch_category_locale_to_template(LC_MONETARY, - category, - save_input_locale); - bool only_ascii = FALSE; - const char * scratch_buffer = NULL; - const U8 * currency_string - = (const U8 *) my_langinfo_c(CRNCYSTR, LC_MONETARY, - save_input_locale, - &scratch_buffer, NULL, NULL); - /* 2nd param not relevant for this item */ - const U8 * first_variant; - - assert( *currency_string == '-' - || *currency_string == '+' - || *currency_string == '.'); - - currency_string++; - - if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant)) - { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Couldn't get currency symbol for %s, or contains" - " only ASCII; can't use for determining if UTF-8" - " locale\n", save_input_locale)); - only_ascii = TRUE; - } - else { - is_utf8 = is_strict_utf8_string(first_variant, 0); - } - Safefree(scratch_buffer); - - restore_switched_locale(LC_MONETARY, original_monetary_locale); - - if (! only_ascii) { - - /* It isn't a UTF-8 locale if the symbol is not legal UTF-8; - * otherwise assume the locale is UTF-8 if and only if the symbol - * is non-ascii UTF-8. */ - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "\t?Currency symbol for %s is UTF-8=%d\n", - save_input_locale, is_utf8)); - goto finish_and_return; - } - } - -# endif /* USE_LOCALE_MONETARY */ -# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME) - - /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try - * the names of the months and weekdays, timezone, and am/pm indicator */ - { - const char *original_time_locale - = switch_category_locale_to_template(LC_TIME, - category, - save_input_locale); - int hour = 10; - bool is_dst = FALSE; - int dom = 1; - int month = 0; - int i; - char * formatted_time; - - /* Here the current LC_TIME is set to the locale of the category - * whose information is desired. Look at all the days of the week - * and month names, and the timezone and am/pm indicator for UTF-8 - * variant characters. The first such a one found will tell us if - * the locale is UTF-8 or not */ - - for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */ - formatted_time = my_strftime("%A %B %Z %p", - 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst); - if ( ! formatted_time - || is_utf8_invariant_string((U8 *) formatted_time, 0)) - { - - /* Here, we didn't find a non-ASCII. Try the next time - * through with the complemented dst and am/pm, and try - * with the next weekday. After we have gotten all - * weekdays, try the next month */ - is_dst = ! is_dst; - hour = (hour + 12) % 24; - dom++; - if (i > 6) { - month++; - } - continue; - } - - /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8; - * false otherwise. But first, restore LC_TIME to its original - * locale if we changed it */ - restore_switched_locale(LC_TIME, original_time_locale); - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "\t?time-related strings for %s are UTF-8=%d\n", - save_input_locale, - is_utf8_string((U8 *) formatted_time, 0))); - is_utf8 = is_utf8_string((U8 *) formatted_time, 0); - goto finish_and_return; - } - - /* Falling off the end of the loop indicates all the names were just - * ASCII. Go on to the next test. If we changed it, restore LC_TIME - * to its original locale */ - restore_switched_locale(LC_TIME, original_time_locale); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "All time-related words for %s contain only ASCII;" - " can't use for determining if UTF-8 locale\n", - save_input_locale)); - } - -# endif - -# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST) - - /* This code is ifdefd out because it was found to not be necessary in - * testing on our dromedary test machine, which has over 700 locales. - * There, this added no value to looking at the currency symbol and the - * time strings. I left it in so as to avoid rewriting it if real-world - * experience indicates that dromedary is an outlier. Essentially, instead - * of returning abpve if we haven't found illegal utf8, we continue on and - * examine all the strerror() messages on the platform for utf8ness. If - * all are ASCII, we still don't know the answer; but otherwise we have a - * pretty good indication of the utf8ness. The reason this doesn't help - * much is that the messages may not have been translated into the locale. - * The currency symbol and time strings are much more likely to have been - * translated. */ - { - int e; - bool non_ascii = FALSE; - const char *original_messages_locale - = switch_category_locale_to_template(LC_MESSAGES, - category, - save_input_locale); - const char * errmsg = NULL; - - /* Here the current LC_MESSAGES is set to the locale of the category - * whose information is desired. Look through all the messages. We - * can't use Strerror() here because it may expand to code that - * segfaults in miniperl */ - - for (e = 0; e <= sys_nerr; e++) { - errno = 0; - errmsg = sys_errlist[e]; - if (errno || !errmsg) { - break; - } - errmsg = savepv(errmsg); - if (! is_utf8_invariant_string((U8 *) errmsg, 0)) { - non_ascii = TRUE; - is_utf8 = is_utf8_string((U8 *) errmsg, 0); - break; - } - } - Safefree(errmsg); - - restore_switched_locale(LC_MESSAGES, original_messages_locale); - - if (non_ascii) { - - /* Any non-UTF-8 message means not a UTF-8 locale; if all are - * valid, any non-ascii means it is one; otherwise we assume it - * isn't */ - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "\t?error messages for %s are UTF-8=%d\n", - save_input_locale, - is_utf8)); - goto finish_and_return; - } - - DEBUG_L(PerlIO_printf(Perl_debug_log, - "All error messages for %s contain only ASCII;" - " can't use for determining if UTF-8 locale\n", - save_input_locale)); - } - -# endif -# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a - UTF-8 locale */ - - /* As a last resort, look at the locale name to see if it matches - * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the - * return of setlocale(), is actually defined to be opaque, so we can't - * really rely on the absence of various substrings in the name to indicate - * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to - * be a UTF-8 locale. Similarly for the other common names */ - - { - const Size_t final_pos = strlen(save_input_locale) - 1; - - if (final_pos >= 3) { - const char *name = save_input_locale; - - /* Find next 'U' or 'u' and look from there */ - while ((name += strcspn(name, "Uu") + 1) - <= save_input_locale + final_pos - 2) - { - if ( isALPHA_FOLD_NE(*name, 't') - || isALPHA_FOLD_NE(*(name + 1), 'f')) - { - continue; - } - name += 2; - if (*(name) == '-') { - if ((name > save_input_locale + final_pos - 1)) { - break; - } - name++; - } - if (*(name) == '8') { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s ends with UTF-8 in name\n", - save_input_locale)); - is_utf8 = TRUE; - goto finish_and_return; - } - } - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s doesn't end with UTF-8 in name\n", - save_input_locale)); - } - -# ifdef WIN32 - - /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ - if (memENDs(save_input_locale, final_pos, "65001")) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s ends with 65001 in name, is UTF-8 locale\n", - save_input_locale)); - is_utf8 = TRUE; - goto finish_and_return; - } - -# endif - } -# endif - - /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But - * since we are about to return FALSE anyway, there is no point in doing - * this extra work */ - -# if 0 - if (instr(save_input_locale, "8859")) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s has 8859 in name, not UTF-8 locale\n", - save_input_locale)); - is_utf8 = FALSE; - goto finish_and_return; - } -# endif - - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Assuming locale %s is not a UTF-8 locale\n", - save_input_locale)); - is_utf8 = FALSE; - -# endif /* the code that is compiled when no modern LC_CTYPE */ - - finish_and_return: - - /* Cache this result so we don't have to go through all this next time. */ - utf8ness_cache_size = sizeof(PL_locale_utf8ness) - - (utf8ness_cache - PL_locale_utf8ness); - - /* But we can't save it if it is too large for the total space available */ - if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) { - Size_t utf8ness_cache_len = strlen(utf8ness_cache); - - /* Here it can fit, but we may need to clear out the oldest cached - * result(s) to do so. Check */ - if (utf8ness_cache_len + input_name_len_with_overhead - >= utf8ness_cache_size) - { - /* Here we have to clear something out to make room for this. - * Start looking at the rightmost place where it could fit and find - * the beginning of the entry that extends past that. */ - char * cutoff = (char *) my_memrchr(utf8ness_cache, - UTF8NESS_SEP[0], - utf8ness_cache_size - - input_name_len_with_overhead); - - assert(cutoff); - assert(cutoff >= utf8ness_cache); - - /* This and all subsequent entries must be removed */ - *cutoff = '\0'; - utf8ness_cache_len = strlen(utf8ness_cache); - } - - /* Make space for the new entry */ - Move(utf8ness_cache, - utf8ness_cache + input_name_len_with_overhead, - utf8ness_cache_len + 1 /* Incl. trailing NUL */, char); - - /* And insert it */ - Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char); - utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; - - if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') { - Perl_croak(aTHX_ - "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu," - " inserted_name=%s, its_len=%zu\n", - __FILE__, __LINE__, - PL_locale_utf8ness, strlen(PL_locale_utf8ness), - delimited, input_name_len_with_overhead); - } - } - -# ifdef DEBUGGING - - if (DEBUG_Lv_TEST) { - const char * s = PL_locale_utf8ness; - - /* Audit the structure */ - while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) { - const char *e; - - if (*s != UTF8NESS_SEP[0]) { - Perl_croak(aTHX_ - "panic: %s: %d: Corrupt utf8ness_cache: missing" - " separator %.*s<-- HERE %s\n", - __FILE__, __LINE__, - (int) (s - PL_locale_utf8ness), PL_locale_utf8ness, - s); - } - s++; - e = strchr(s, UTF8NESS_PREFIX[0]); - if (! e) { - e = PL_locale_utf8ness + strlen(PL_locale_utf8ness); - Perl_croak(aTHX_ - "panic: %s: %d: Corrupt utf8ness_cache: missing" - " separator %.*s<-- HERE %s\n", - __FILE__, __LINE__, - (int) (e - PL_locale_utf8ness), PL_locale_utf8ness, - e); - } - e++; - if (*e != '0' && *e != '1') { - Perl_croak(aTHX_ - "panic: %s: %d: Corrupt utf8ness_cache: utf8ness" - " must be [01] %.*s<-- HERE %s\n", - __FILE__, __LINE__, - (int) (e + 1 - PL_locale_utf8ness), - PL_locale_utf8ness, e + 1); - } - if (ninstr(PL_locale_utf8ness, s, s-1, e)) { - Perl_croak(aTHX_ - "panic: %s: %d: Corrupt utf8ness_cache: entry" - " has duplicate %.*s<-- HERE %s\n", - __FILE__, __LINE__, - (int) (e - PL_locale_utf8ness), PL_locale_utf8ness, - e); - } - s = e + 1; - } - } - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "PL_locale_utf8ness is now %s; returning %d\n", - PL_locale_utf8ness, is_utf8)); - -# endif - - /* free only when not using the buffer */ - if ( delimited != buffer ) Safefree(delimited); - Safefree(save_input_locale); - return is_utf8; -} - STATIC bool S_is_codeset_name_UTF8(const char * name) { diff --git a/proto.h b/proto.h index 43a29cd0bc64..e6ab8eb7ff62 100644 --- a/proto.h +++ b/proto.h @@ -5161,8 +5161,6 @@ PERL_CALLCONV SV* Perl_hfree_next_entry(pTHX_ HV *hv, STRLEN *indexp); #endif #if defined(PERL_IN_LOCALE_C) # if defined(USE_LOCALE) -STATIC const char* S_category_name(const int category); -#define PERL_ARGS_ASSERT_CATEGORY_NAME STATIC unsigned int S_get_category_index(const int category, const char * locale); #define PERL_ARGS_ASSERT_GET_CATEGORY_INDEX STATIC unsigned S_get_locale_string_utf8ness_i(pTHX_ const char * locale, const unsigned cat_index, const char * string, const int known_utf8); @@ -5182,8 +5180,6 @@ STATIC void S_new_ctype(pTHX_ const char* newctype); assert(newctype) STATIC void S_new_numeric(pTHX_ const char* newnum); #define PERL_ARGS_ASSERT_NEW_NUMERIC -STATIC void S_restore_switched_locale(pTHX_ const int category, const char * const original_locale); -#define PERL_ARGS_ASSERT_RESTORE_SWITCHED_LOCALE STATIC void S_restore_toggled_locale_i(pTHX_ const unsigned cat_index, const char * original_locale); #define PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I STATIC const char * S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size) @@ -5199,8 +5195,6 @@ PERL_STATIC_NO_RET void S_setlocale_failure_panic_i(pTHX_ const unsigned int cat STATIC const char* S_stdize_locale(pTHX_ const int category, const char* input_locale, const char **buf, Size_t *buf_size); #define PERL_ARGS_ASSERT_STDIZE_LOCALE -STATIC const char* S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale); -#define PERL_ARGS_ASSERT_SWITCH_CATEGORY_LOCALE_TO_TEMPLATE STATIC const char * S_toggle_locale_i(pTHX_ const unsigned switch_cat_index, const char * new_locale); #define PERL_ARGS_ASSERT_TOGGLE_LOCALE_I \ assert(new_locale) @@ -6898,10 +6892,6 @@ PERL_CALLCONV SV* Perl_sv_dup_inc(pTHX_ const SV *const ssv, CLONE_PARAMS *const #define PERL_ARGS_ASSERT_SV_DUP_INC \ assert(param) -#endif -#if defined(USE_LOCALE) && ( defined(PERL_IN_LOCALE_C) || defined(PERL_IN_MG_C) || defined (PERL_EXT_POSIX) || defined (PERL_EXT_LANGINFO)) -PERL_CALLCONV bool Perl__is_cur_LC_category_utf8(pTHX_ int category); -#define PERL_ARGS_ASSERT__IS_CUR_LC_CATEGORY_UTF8 #endif #if defined(USE_LOCALE_COLLATE) PERL_CALLCONV int Perl_magic_freecollxfrm(pTHX_ SV* sv, MAGIC* mg); diff --git a/sv.c b/sv.c index 819ae00a71bb..7a71f2f81d99 100644 --- a/sv.c +++ b/sv.c @@ -15341,7 +15341,6 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_utf8locale = proto_perl->Iutf8locale; PL_in_utf8_CTYPE_locale = proto_perl->Iin_utf8_CTYPE_locale; PL_in_utf8_COLLATE_locale = proto_perl->Iin_utf8_COLLATE_locale; - my_strlcpy(PL_locale_utf8ness, proto_perl->Ilocale_utf8ness, sizeof(PL_locale_utf8ness)); #if defined(USE_ITHREADS) && ! defined(USE_THREAD_SAFE_LOCALE) PL_lc_numeric_mutex_depth = 0; #endif