From 5d8dd030a7b07f5ea8e37a449e293e761b455606 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 1 Mar 2021 18:07:19 -0700 Subject: [PATCH] locale.c: Add utf8ness return param to static fcn my_langinfo_i() now will additionally return the UTF-8ness of the returned string. --- embed.fnc | 6 ++-- embed.h | 4 +-- locale.c | 84 ++++++++++++++++++++++++++++++++++++++++--------------- proto.h | 4 +-- 4 files changed, 69 insertions(+), 29 deletions(-) diff --git a/embed.fnc b/embed.fnc index ca7ab81a7b90..cdc8dde6b85b 100644 --- a/embed.fnc +++ b/embed.fnc @@ -3242,13 +3242,15 @@ S |const char*|my_langinfo_i|const nl_item item \ |const unsigned int cat_index \ |NULLOK const char * locale \ |NN const char ** retbufp \ - |NULLOK Size_t * retbuf_sizep + |NULLOK Size_t * retbuf_sizep \ + |NULLOK int * utf8ness # else S |const char*|my_langinfo_i|const int item \ |const unsigned int cat_index \ |NULLOK const char * locale \ |NN const char ** retbufp \ - |NULLOK Size_t * retbuf_sizep + |NULLOK Size_t * retbuf_sizep \ + |NULLOK int * utf8ness # if (defined(HAS_LOCALECONV) || defined(HAS_LOCALECONV_L)) \ && (defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_NUMERIC)) S |HV * |get_nl_item_from_localeconv \ diff --git a/embed.h b/embed.h index 898b2f46091b..e36480d77cfc 100644 --- a/embed.h +++ b/embed.h @@ -1513,7 +1513,7 @@ # endif # if defined(PERL_IN_LOCALE_C) # if defined(USE_LOCALE) -#define my_langinfo_i(a,b,c,d,e) S_my_langinfo_i(aTHX_ a,b,c,d,e) +#define my_langinfo_i(a,b,c,d,e,f) S_my_langinfo_i(aTHX_ a,b,c,d,e,f) # endif # endif # endif @@ -1631,7 +1631,7 @@ # if defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L) # if defined(PERL_IN_LOCALE_C) # if defined(USE_LOCALE) -#define my_langinfo_i(a,b,c,d,e) S_my_langinfo_i(aTHX_ a,b,c,d,e) +#define my_langinfo_i(a,b,c,d,e,f) S_my_langinfo_i(aTHX_ a,b,c,d,e,f) # endif # endif # endif diff --git a/locale.c b/locale.c index aa1f205558d6..4fa662a82054 100644 --- a/locale.c +++ b/locale.c @@ -156,8 +156,9 @@ static const char C_thousands_sep[] = ""; /* For use in calling my_langinfo() */ #define USE_UNDERLYING_NUMERIC ((char *) 1) -#define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep) \ - my_langinfo_i(item, category##_INDEX_, locale, retbufp, retbuf_sizep) +#define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \ + my_langinfo_i(item, category##_INDEX_, locale, retbufp, \ + retbuf_sizep, utf8ness) #ifdef USE_LOCALE @@ -1520,6 +1521,7 @@ S_set_numeric_radix(pTHX_ const bool use_locale) # if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_SOME_LOCALECONV) \ || defined(HAS_SOME_LANGINFO)) + int utf8ness = 1; const char * radix; const char * scratch_buffer = NULL; @@ -1529,18 +1531,13 @@ S_set_numeric_radix(pTHX_ const bool use_locale) else { radix = my_langinfo_c(RADIXCHAR, LC_NUMERIC, USE_UNDERLYING_NUMERIC, - &scratch_buffer, NULL); + &scratch_buffer, NULL, &utf8ness); } sv_setpv(PL_numeric_radix_sv, radix); Safefree(scratch_buffer); - /* If this is valid UTF-8 that isn't totally ASCII, and we are in - * a UTF-8 locale, then mark the radix as being in UTF-8 */ - if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv), - SvCUR(PL_numeric_radix_sv)) - && _is_cur_LC_category_utf8(LC_NUMERIC)) - { + if (utf8ness > 1) { SvUTF8_on(PL_numeric_radix_sv); } @@ -1634,12 +1631,12 @@ S_new_numeric(pTHX_ const char *newnum) PL_numeric_standard = strEQ(C_decimal_point, my_langinfo_c(RADIXCHAR, LC_NUMERIC, NULL, &scratch_buffer, - &buf_size)) + &buf_size, NULL)) && strEQ(C_thousands_sep, my_langinfo_c(THOUSEP, LC_NUMERIC, NULL, &scratch_buffer, - &buf_size)); + &buf_size, NULL)); Safefree(scratch_buffer); } @@ -2021,8 +2018,8 @@ S_new_ctype(pTHX_ const char *newctype) { const char * scratch_buffer = NULL; Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s", - my_langinfo_c(CODESET, LC_CTYPE, - NULL, &scratch_buffer, NULL)); + my_langinfo_c(CODESET, LC_CTYPE, NULL, + &scratch_buffer, NULL, NULL)); Safefree(scratch_buffer); } @@ -3683,11 +3680,11 @@ Perl_langinfo(const nl_item item) /* Use either the underlying numeric, or the other underlying categories */ if (cat_index == LC_NUMERIC_INDEX_) { return my_langinfo_c(item, LC_NUMERIC, USE_UNDERLYING_NUMERIC, - &PL_langinfo_buf, &PL_langinfo_bufsize); + &PL_langinfo_buf, &PL_langinfo_bufsize, NULL); } else { return my_langinfo_i(item, cat_index, NULL, - &PL_langinfo_buf, &PL_langinfo_bufsize); + &PL_langinfo_buf, &PL_langinfo_bufsize, NULL); } #endif @@ -3714,7 +3711,11 @@ S_my_langinfo_i(pTHX_ * empty-on-entry, single use buffer whose size we don't need * to keep track of */ const char ** retbufp, - Size_t * retbuf_sizep) + Size_t * retbuf_sizep, + + /* If not NULL, the location to store the UTF8-ness of 'item's + * value, as documented */ + int * utf8ness) { const char * retval; @@ -3745,10 +3746,12 @@ S_my_langinfo_i(pTHX_ if (locale == NULL) { cur = use_curlocale_scratch(); + locale = NULL; } else if (locale == USE_UNDERLYING_NUMERIC) { assert(cat_index == LC_NUMERIC_INDEX_); cur = PL_underlying_numeric_obj; + locale = PL_numeric_name; } else { cur = newlocale(category_masks[cat_index], locale, (locale_t) 0); @@ -3757,6 +3760,11 @@ S_my_langinfo_i(pTHX_ retval = save_to_buffer(nl_langinfo_l(item, cur), retbufp, retbuf_sizep); + if (utf8ness) { + *utf8ness = get_locale_string_utf8ness_i(locale, cat_index, + retval, UTF8NESS_UNKNOWN); + } + if (need_free) { freelocale(cur); } @@ -3782,6 +3790,9 @@ S_my_langinfo_i(pTHX_ else if (locale != NULL) { orig_switched_locale = toggle_locale_i(cat_index, locale); } + else { + locale = NULL; + } NL_LANGINFO_LOCK; retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep); @@ -3794,6 +3805,11 @@ S_my_langinfo_i(pTHX_ restore_toggled_locale_i(cat_index, orig_switched_locale); } + if (utf8ness) { + *utf8ness = get_locale_string_utf8ness_i(locale, cat_index, + retval, UTF8NESS_UNKNOWN); + } + return retval; } /*--------------------------------------------------------------------------*/ @@ -3803,6 +3819,12 @@ S_my_langinfo_i(pTHX_ * nl_langinfo(). There are various possibilities depending on the * Configuration */ + /* Almost all the items will have ASCII return values. Set that here, and + * override if necessary */ + if (utf8ness) { + *utf8ness = 1; + } + /* If the desired locale to get the information about isn't the current * one, switch to it, and call ourselves recursively */ if (locale == NULL) { @@ -3817,7 +3839,7 @@ S_my_langinfo_i(pTHX_ const char * orig_switched_locale = toggle_locale_i(cat_index, want_locale); retval = my_langinfo_i(item, cat_index, NULL, - retbufp, retbuf_sizep); + retbufp, retbuf_sizep, utf8ness); restore_toggled_locale_i(cat_index, orig_switched_locale); return retval; } @@ -3879,6 +3901,11 @@ S_my_langinfo_i(pTHX_ *floatbuf = '\0'; retval = save_to_buffer(item_start, retbufp, retbuf_sizep); + if (utf8ness) { + *utf8ness = get_locale_string_utf8ness_i(locale, cat_index, retval, + UTF8NESS_UNKNOWN); + } + return retval; } @@ -3909,6 +3936,12 @@ S_my_langinfo_i(pTHX_ retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep); + if (utf8ness) { + *utf8ness = get_locale_string_utf8ness_i(locale, cat_index, + retval, + UTF8NESS_UNKNOWN); + } + SvREFCNT_dec_NN(string); return retval; } @@ -4025,8 +4058,8 @@ S_my_langinfo_i(pTHX_ /* The year was deliberately chosen so that January 1 is on the * first day of the week. Since we're only getting one thing at a * time, it all works */ - temp = my_strftime(format, 30, 30, hour, mday, mon, - 2011, 0, 0, 0); + temp = my_strftime8(format, 30, 30, hour, mday, mon, + 2011, 0, 0, 0, utf8ness); retval = save_to_buffer(temp, retbufp, retbuf_sizep); Safefree(temp); @@ -4057,7 +4090,12 @@ S_my_langinfo_i(pTHX_ * evidence that it should work differently, this returns the alt-0 * string for ALT_DIGITS. */ if (! return_format) { - return retval; + return retval; /* *utf8ness was set by my_strftime8() */ + } + + /* A format is always in ASCII */ + if (utf8ness) { + *utf8ness = 1; } /* If to return the format, not the value, overwrite the buffer @@ -5817,7 +5855,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) iswalnum() etc. due to their not handling 80-FF correctly */ const char * scratch_buffer = NULL; const char *codeset = my_langinfo_c(CODESET, LC_CTYPE, NULL, - &scratch_buffer, NULL); + &scratch_buffer, NULL, NULL); DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\tnllanginfo returned CODESET '%s'\n", codeset)); @@ -5890,7 +5928,7 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) const U8 * currency_string = (const U8 *) my_langinfo_c(CRNCYSTR, LC_MONETARY, save_input_locale, - &scratch_buffer, NULL); + &scratch_buffer, NULL, NULL); /* 2nd param not relevant for this item */ const U8 * first_variant; @@ -6294,7 +6332,7 @@ S_is_locale_utf8(pTHX_ const char * locale) const char * scratch_buffer = NULL; const char * codeset = my_langinfo_c(CODESET, LC_CTYPE, locale, - &scratch_buffer, NULL); + &scratch_buffer, NULL, NULL); bool retval = is_codeset_name_UTF8(codeset); PERL_ARGS_ASSERT_IS_LOCALE_UTF8; diff --git a/proto.h b/proto.h index 3be7664851da..c361c3858932 100644 --- a/proto.h +++ b/proto.h @@ -4236,7 +4236,7 @@ STATIC HV * S_get_nl_item_from_localeconv(pTHX_ const struct lconv *lcbuf, const # endif # if defined(PERL_IN_LOCALE_C) # if defined(USE_LOCALE) -STATIC const char* S_my_langinfo_i(pTHX_ const int item, const unsigned int cat_index, const char * locale, const char ** retbufp, Size_t * retbuf_sizep); +STATIC const char* S_my_langinfo_i(pTHX_ const int item, const unsigned int cat_index, const char * locale, const char ** retbufp, Size_t * retbuf_sizep, int * utf8ness); #define PERL_ARGS_ASSERT_MY_LANGINFO_I \ assert(retbufp) # endif @@ -4731,7 +4731,7 @@ PERL_CALLCONV const char* Perl_langinfo(const nl_item item); #if defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L) # if defined(PERL_IN_LOCALE_C) # if defined(USE_LOCALE) -STATIC const char* S_my_langinfo_i(pTHX_ const nl_item item, const unsigned int cat_index, const char * locale, const char ** retbufp, Size_t * retbuf_sizep); +STATIC const char* S_my_langinfo_i(pTHX_ const nl_item item, const unsigned int cat_index, const char * locale, const char ** retbufp, Size_t * retbuf_sizep, int * utf8ness); #define PERL_ARGS_ASSERT_MY_LANGINFO_I \ assert(retbufp) # endif