From f2e9cc2c765ba1758a11cba3783d2f4a45ee19d7 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 2 Mar 2021 04:29:08 -0700 Subject: [PATCH] Move utf8ness calc for $! into locale.c from mg.c locale.c has the infrastructure to handle this, so remove repeated logic. The removed code tried to discern better based on using script runs, but this actually doesn't help, so is removed. --- embed.fnc | 5 +++- embed.h | 5 +++- locale.c | 77 +++++++++++++++++++++++++++++++++++++++++++----------- makedef.pl | 2 +- mg.c | 54 +++++++++++++++----------------------- proto.h | 7 +++-- 6 files changed, 97 insertions(+), 53 deletions(-) diff --git a/embed.fnc b/embed.fnc index da7bcf296443..594b679661b7 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1606,10 +1606,13 @@ ATdo |const char*|Perl_langinfo8|const nl_item item|NULLOK int * utf8ness ATdo |const char*|Perl_langinfo|const int item ATdo |const char*|Perl_langinfo8|const int item|NULLOK int * utf8ness #endif +#ifdef WIN32 +p |bool |get_win32_message_utf8ness|NULLOK const char * string +#endif pEX |int |mbtowc_|NULLOK const wchar_t * pwc|NULLOK const char * s|const Size_t len CpO |int |init_i18nl10n |int printwarn CbpOD |int |init_i18nl14n |int printwarn -p |char* |my_strerror |const int errnum +p |char* |my_strerror |const int errnum|NN int * utf8ness XpT |void |_warn_problematic_locale Xp |void |set_numeric_underlying Xp |void |set_numeric_standard diff --git a/embed.h b/embed.h index a5646a7b9a2f..39004ad82935 100644 --- a/embed.h +++ b/embed.h @@ -1411,7 +1411,7 @@ #define my_clearenv() Perl_my_clearenv(aTHX) #define my_lstat_flags(a) Perl_my_lstat_flags(aTHX_ a) #define my_stat_flags(a) Perl_my_stat_flags(aTHX_ a) -#define my_strerror(a) Perl_my_strerror(aTHX_ a) +#define my_strerror(a,b) Perl_my_strerror(aTHX_ a,b) #define my_unexec() Perl_my_unexec(aTHX) #define newATTRSUB_x(a,b,c,d,e,f) Perl_newATTRSUB_x(aTHX_ a,b,c,d,e,f) #define newSTUB(a,b) Perl_newSTUB(aTHX_ a,b) @@ -2085,6 +2085,9 @@ #define quadmath_format_needed Perl_quadmath_format_needed #define quadmath_format_valid Perl_quadmath_format_valid # endif +# if defined(WIN32) +#define get_win32_message_utf8ness(a) Perl_get_win32_message_utf8ness(aTHX_ a) +# endif # if defined(_MSC_VER) #define magic_regdatum_set(a,b) Perl_magic_regdatum_set(aTHX_ a,b) # endif diff --git a/locale.c b/locale.c index 571a3e590188..9f092d067d4d 100644 --- a/locale.c +++ b/locale.c @@ -2828,12 +2828,30 @@ S_get_locale_string_utf8ness_i(pTHX_ const char * locale, return 2; -#endif +# endif } +# ifdef WIN32 + +bool +Perl_get_win32_message_utf8ness(pTHX_ const char * string) +{ + /* This function knows the internal workings of + * get_locale_string_utf8ness_i() so that: + * + * NULL => locale irrelevant + * 0 => category irrelevant + * TRUE => assume the locale is UTF-8, so returns based on the legality of + * the input string, ignoring the locale and category completely */ + + return get_locale_string_utf8ness_i(NULL, 0, string, TRUE); +} + +# endif #endif /* USE_LOCALE */ + int Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len) { @@ -6533,10 +6551,11 @@ Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) " Within locale scope=%d\n", \ __FILE__, __LINE__, errnum, in_locale)) -#define DEBUG_STRERROR_RETURN(errstr) \ +#define DEBUG_STRERROR_RETURN(errstr, utf8ness) \ DEBUG_Lv(PerlIO_printf(Perl_debug_log, \ "%s:%d Strerror returned; saving a copy: '", __FILE__, __LINE__); \ - print_bytes_for_locale(errstr, errstr + strlen(errstr), 0)); + print_bytes_for_locale(errstr, errstr + strlen(errstr), 0); \ + PerlIO_printf(Perl_debug_log, "'; utf8ness=%d\n", *utf8ness)); /* On platforms that have precisely one of these categories (Windows * qualifies), these yield the correct one */ @@ -6553,6 +6572,11 @@ Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) * returns. Otherwise the text is derived from the locale, LC_MESSAGES if we * have that; LC_CTYPE if not. * + * It returns in *utf8ness the result's UTF-8ness: + * 0 = definitely not + * 1 = immaterial: representation is the same in UTF-8 as not + * 2 = defintely yes + * * The function just calls strerror(), but temporarily switches locales, if * needed. Many platforms require LC_CTYPE and LC_MESSAGES to be in the same * CODESET in order for the return from strerror() to not contain '?' symbols, @@ -6581,12 +6605,15 @@ Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) /* Here, neither category is defined: use the C locale */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); + PERL_ARGS_ASSERT_MY_STRERROR; + DEBUG_STRERROR_ENTER(errnum, 0); - DEBUG_STRERROR_RETURN(errstr); + *utf8ness = 1; + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; @@ -6601,7 +6628,7 @@ Perl_my_strerror(pTHX_ const int errnum) * locale; otherwise use the current locale object */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr; @@ -6615,7 +6642,10 @@ Perl_my_strerror(pTHX_ const int errnum) DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX])); errstr = savepv(strerror_l(errnum, which_obj)); - DEBUG_STRERROR_RETURN(errstr); + + *utf8ness = get_locale_string_utf8ness_i(NULL, WHICH_LC_INDEX, errstr, + UTF8NESS_UNKNOWN); + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; @@ -6627,14 +6657,17 @@ Perl_my_strerror(pTHX_ const int errnum) * either C or the LC_MESSAGES locale */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr; + PERL_ARGS_ASSERT_MY_STRERROR; + DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES)); if (! IN_LC(LC_MESSAGES)) { /* Use C if not within locale scope */ errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); + *utf8ness = 1; } else { /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE matches */ @@ -6642,10 +6675,12 @@ Perl_my_strerror(pTHX_ const int errnum) cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur); errstr = savepv(strerror_l(errnum, cur)); + *utf8ness = get_locale_string_utf8ness_i(NULL, LC_MESSAGES_INDEX_, + errstr, UTF8NESS_UNKNOWN); freelocale(cur); } - DEBUG_STRERROR_RETURN(errstr); + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; @@ -6659,15 +6694,19 @@ Perl_my_strerror(pTHX_ const int errnum) * strerror */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr; + PERL_ARGS_ASSERT_MY_STRERROR; + DEBUG_STRERROR_ENTER(errnum, 0); errstr = savepv(Strerror(errnum)); - DEBUG_STRERROR_RETURN(errstr); + *utf8ness = 1; + + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; @@ -6681,7 +6720,7 @@ Perl_my_strerror(pTHX_ const int errnum) * locale; otherwise use the current locale */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr; @@ -6691,6 +6730,8 @@ Perl_my_strerror(pTHX_ const int errnum) if (IN_LC(categories[WHICH_LC_INDEX])) { errstr = savepv(Strerror(errnum)); + *utf8ness = get_locale_string_utf8ness_i(NULL, WHICH_LC_INDEX, errstr, + UTF8NESS_UNKNOWN); } else { const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C"); @@ -6698,9 +6739,11 @@ Perl_my_strerror(pTHX_ const int errnum) errstr = savepv(Strerror(errnum)); restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale); + + *utf8ness = 1; } - DEBUG_STRERROR_RETURN(errstr); + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; @@ -6713,7 +6756,7 @@ Perl_my_strerror(pTHX_ const int errnum) * either C or the LC_MESSAGES locale */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr; const char * desired_locale = savepv((IN_LC(LC_MESSAGES)) @@ -6723,6 +6766,8 @@ Perl_my_strerror(pTHX_ const int errnum) const char * orig_MESSAGES_locale; /* XXX Can fail on z/OS */ + PERL_ARGS_ASSERT_MY_STRERROR; + DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES)); orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, desired_locale); @@ -6733,7 +6778,9 @@ Perl_my_strerror(pTHX_ const int errnum) restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale); restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); - DEBUG_STRERROR_RETURN(errstr); + *utf8ness = get_locale_string_utf8ness_i(NULL, LC_MESSAGES_INDEX_, errstr, + UTF8NESS_UNKNOWN); + DEBUG_STRERROR_RETURN(errstr, utf8ness); Safefree(desired_locale); SAVEFREEPV(errstr); diff --git a/makedef.pl b/makedef.pl index 9abacac179f5..07ed77a56595 100644 --- a/makedef.pl +++ b/makedef.pl @@ -271,7 +271,7 @@ sub readvar { if ($ARGS{PLATFORM} ne 'win32') { ++$skip{$_} foreach qw( - Perl_my_setlocale + Perl_get_win32_message_utf8ness ); } diff --git a/mg.c b/mg.c index 3355df1b4b64..40a692a8624b 100644 --- a/mg.c +++ b/mg.c @@ -808,37 +808,6 @@ S_fixup_errno_string(pTHX_ SV* sv) if(strEQ(SvPVX(sv), "")) { sv_catpv(sv, UNKNOWN_ERRNO_MSG); } - else { - - /* In some locales the error string may come back as UTF-8, in which - * case we should turn on that flag. This didn't use to happen, and to - * avoid as many possible backward compatibility issues as possible, we - * don't turn on the flag unless we have to. So the flag stays off for - * an entirely invariant string. We assume that if the string looks - * like UTF-8 in a single script, it really is UTF-8: "text in any - * other encoding that uses bytes with the high bit set is extremely - * unlikely to pass a UTF-8 validity test" - * (http://en.wikipedia.org/wiki/Charset_detection). There is a - * potential that we will get it wrong however, especially on short - * error message text, so do an additional check. */ - if ( ! IN_BYTES /* respect 'use bytes' */ - && is_utf8_non_invariant_string((U8*) SvPVX_const(sv), SvCUR(sv)) - -#ifdef USE_LOCALE_MESSAGES - - && _is_cur_LC_category_utf8(LC_MESSAGES) - -#else /* If can't check directly, at least can see if script is consistent, - under UTF-8, which gives us an extra measure of confidence. */ - - && isSCRIPT_RUN((const U8 *) SvPVX_const(sv), (U8 *) SvEND(sv), - TRUE) /* Means assume UTF-8 */ -#endif - - ) { - SvUTF8_on(sv); - } - } } /* @@ -876,11 +845,16 @@ SV * Perl_sv_string_from_errnum(pTHX_ int errnum, SV *tgtsv) { char const *errstr; + int utf8ness; + if(!tgtsv) tgtsv = sv_newmortal(); - errstr = my_strerror(errnum); + errstr = my_strerror(errnum, &utf8ness); if(errstr) { sv_setpv(tgtsv, errstr); + if (utf8ness > 1) { + SvUTF8_on(tgtsv); + } fixup_errno_string(tgtsv); } else { SvPVCLEAR(tgtsv); @@ -955,9 +929,11 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) SvPVCLEAR(sv); } #elif defined(OS2) + { + int utf8ness; if (!(_emx_env & 0x200)) { /* Under DOS */ sv_setnv(sv, (NV)errno); - sv_setpv(sv, errno ? my_strerror(errno) : ""); + sv_setpv(sv, errno ? my_strerror(errnum, &utf8ness) : ""); } else { if (errno != errno_isOS2) { const int tmp = _syserrno(); @@ -968,8 +944,12 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) sv_setpv(sv, os2error(Perl_rc)); } if (SvOK(sv) && strNE(SvPVX(sv), "")) { + if (utf8ness > 1) { + SvUTF8_on(sv); + } fixup_errno_string(sv); } + } # elif defined(WIN32) { const DWORD dwErr = GetLastError(); @@ -977,6 +957,14 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) if (dwErr) { PerlProc_GetOSError(sv, dwErr); fixup_errno_string(sv); + +# ifdef USE_LOCALE + if ( IN_LOCALE + && get_win32_message_utf8ness(SvPV_nomg_const_nolen(sv))) + { + SvUTF8_on(sv); + } +# endif } else SvPVCLEAR(sv); diff --git a/proto.h b/proto.h index 95713010d115..43a29cd0bc64 100644 --- a/proto.h +++ b/proto.h @@ -2195,8 +2195,9 @@ PERL_CALLCONV int Perl_my_socketpair(int family, int type, int protocol, int fd[ #define PERL_ARGS_ASSERT_MY_STAT PERL_CALLCONV I32 Perl_my_stat_flags(pTHX_ const U32 flags); #define PERL_ARGS_ASSERT_MY_STAT_FLAGS -PERL_CALLCONV char* Perl_my_strerror(pTHX_ const int errnum); -#define PERL_ARGS_ASSERT_MY_STRERROR +PERL_CALLCONV char* Perl_my_strerror(pTHX_ const int errnum, int * utf8ness); +#define PERL_ARGS_ASSERT_MY_STRERROR \ + assert(utf8ness) PERL_CALLCONV char * Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour, int mday, int mon, int year, int wday, int yday, int isdst) __attribute__format__(__strftime__,pTHX_1,0); #define PERL_ARGS_ASSERT_MY_STRFTIME \ @@ -6997,6 +6998,8 @@ PERL_CALLCONV bool Perl_quadmath_format_valid(const char* format); assert(format) #endif #if defined(WIN32) +PERL_CALLCONV bool Perl_get_win32_message_utf8ness(pTHX_ const char * string); +#define PERL_ARGS_ASSERT_GET_WIN32_MESSAGE_UTF8NESS PERL_CALLCONV_NO_RET void win32_croak_not_implemented(const char * fname) __attribute__noreturn__; #define PERL_ARGS_ASSERT_WIN32_CROAK_NOT_IMPLEMENTED \