diff --git a/embed.fnc b/embed.fnc index da7bcf296443..594b679661b7 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1606,10 +1606,13 @@ ATdo |const char*|Perl_langinfo8|const nl_item item|NULLOK int * utf8ness ATdo |const char*|Perl_langinfo|const int item ATdo |const char*|Perl_langinfo8|const int item|NULLOK int * utf8ness #endif +#ifdef WIN32 +p |bool |get_win32_message_utf8ness|NULLOK const char * string +#endif pEX |int |mbtowc_|NULLOK const wchar_t * pwc|NULLOK const char * s|const Size_t len CpO |int |init_i18nl10n |int printwarn CbpOD |int |init_i18nl14n |int printwarn -p |char* |my_strerror |const int errnum +p |char* |my_strerror |const int errnum|NN int * utf8ness XpT |void |_warn_problematic_locale Xp |void |set_numeric_underlying Xp |void |set_numeric_standard diff --git a/embed.h b/embed.h index a5646a7b9a2f..39004ad82935 100644 --- a/embed.h +++ b/embed.h @@ -1411,7 +1411,7 @@ #define my_clearenv() Perl_my_clearenv(aTHX) #define my_lstat_flags(a) Perl_my_lstat_flags(aTHX_ a) #define my_stat_flags(a) Perl_my_stat_flags(aTHX_ a) -#define my_strerror(a) Perl_my_strerror(aTHX_ a) +#define my_strerror(a,b) Perl_my_strerror(aTHX_ a,b) #define my_unexec() Perl_my_unexec(aTHX) #define newATTRSUB_x(a,b,c,d,e,f) Perl_newATTRSUB_x(aTHX_ a,b,c,d,e,f) #define newSTUB(a,b) Perl_newSTUB(aTHX_ a,b) @@ -2085,6 +2085,9 @@ #define quadmath_format_needed Perl_quadmath_format_needed #define quadmath_format_valid Perl_quadmath_format_valid # endif +# if defined(WIN32) +#define get_win32_message_utf8ness(a) Perl_get_win32_message_utf8ness(aTHX_ a) +# endif # if defined(_MSC_VER) #define magic_regdatum_set(a,b) Perl_magic_regdatum_set(aTHX_ a,b) # endif diff --git a/locale.c b/locale.c index 571a3e590188..9f092d067d4d 100644 --- a/locale.c +++ b/locale.c @@ -2828,12 +2828,30 @@ S_get_locale_string_utf8ness_i(pTHX_ const char * locale, return 2; -#endif +# endif } +# ifdef WIN32 + +bool +Perl_get_win32_message_utf8ness(pTHX_ const char * string) +{ + /* This function knows the internal workings of + * get_locale_string_utf8ness_i() so that: + * + * NULL => locale irrelevant + * 0 => category irrelevant + * TRUE => assume the locale is UTF-8, so returns based on the legality of + * the input string, ignoring the locale and category completely */ + + return get_locale_string_utf8ness_i(NULL, 0, string, TRUE); +} + +# endif #endif /* USE_LOCALE */ + int Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len) { @@ -6533,10 +6551,11 @@ Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) " Within locale scope=%d\n", \ __FILE__, __LINE__, errnum, in_locale)) -#define DEBUG_STRERROR_RETURN(errstr) \ +#define DEBUG_STRERROR_RETURN(errstr, utf8ness) \ DEBUG_Lv(PerlIO_printf(Perl_debug_log, \ "%s:%d Strerror returned; saving a copy: '", __FILE__, __LINE__); \ - print_bytes_for_locale(errstr, errstr + strlen(errstr), 0)); + print_bytes_for_locale(errstr, errstr + strlen(errstr), 0); \ + PerlIO_printf(Perl_debug_log, "'; utf8ness=%d\n", *utf8ness)); /* On platforms that have precisely one of these categories (Windows * qualifies), these yield the correct one */ @@ -6553,6 +6572,11 @@ Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) * returns. Otherwise the text is derived from the locale, LC_MESSAGES if we * have that; LC_CTYPE if not. * + * It returns in *utf8ness the result's UTF-8ness: + * 0 = definitely not + * 1 = immaterial: representation is the same in UTF-8 as not + * 2 = defintely yes + * * The function just calls strerror(), but temporarily switches locales, if * needed. Many platforms require LC_CTYPE and LC_MESSAGES to be in the same * CODESET in order for the return from strerror() to not contain '?' symbols, @@ -6581,12 +6605,15 @@ Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) /* Here, neither category is defined: use the C locale */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); + PERL_ARGS_ASSERT_MY_STRERROR; + DEBUG_STRERROR_ENTER(errnum, 0); - DEBUG_STRERROR_RETURN(errstr); + *utf8ness = 1; + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; @@ -6601,7 +6628,7 @@ Perl_my_strerror(pTHX_ const int errnum) * locale; otherwise use the current locale object */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr; @@ -6615,7 +6642,10 @@ Perl_my_strerror(pTHX_ const int errnum) DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX])); errstr = savepv(strerror_l(errnum, which_obj)); - DEBUG_STRERROR_RETURN(errstr); + + *utf8ness = get_locale_string_utf8ness_i(NULL, WHICH_LC_INDEX, errstr, + UTF8NESS_UNKNOWN); + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; @@ -6627,14 +6657,17 @@ Perl_my_strerror(pTHX_ const int errnum) * either C or the LC_MESSAGES locale */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr; + PERL_ARGS_ASSERT_MY_STRERROR; + DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES)); if (! IN_LC(LC_MESSAGES)) { /* Use C if not within locale scope */ errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); + *utf8ness = 1; } else { /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE matches */ @@ -6642,10 +6675,12 @@ Perl_my_strerror(pTHX_ const int errnum) cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur); errstr = savepv(strerror_l(errnum, cur)); + *utf8ness = get_locale_string_utf8ness_i(NULL, LC_MESSAGES_INDEX_, + errstr, UTF8NESS_UNKNOWN); freelocale(cur); } - DEBUG_STRERROR_RETURN(errstr); + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; @@ -6659,15 +6694,19 @@ Perl_my_strerror(pTHX_ const int errnum) * strerror */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr; + PERL_ARGS_ASSERT_MY_STRERROR; + DEBUG_STRERROR_ENTER(errnum, 0); errstr = savepv(Strerror(errnum)); - DEBUG_STRERROR_RETURN(errstr); + *utf8ness = 1; + + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; @@ -6681,7 +6720,7 @@ Perl_my_strerror(pTHX_ const int errnum) * locale; otherwise use the current locale */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr; @@ -6691,6 +6730,8 @@ Perl_my_strerror(pTHX_ const int errnum) if (IN_LC(categories[WHICH_LC_INDEX])) { errstr = savepv(Strerror(errnum)); + *utf8ness = get_locale_string_utf8ness_i(NULL, WHICH_LC_INDEX, errstr, + UTF8NESS_UNKNOWN); } else { const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C"); @@ -6698,9 +6739,11 @@ Perl_my_strerror(pTHX_ const int errnum) errstr = savepv(Strerror(errnum)); restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale); + + *utf8ness = 1; } - DEBUG_STRERROR_RETURN(errstr); + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; @@ -6713,7 +6756,7 @@ Perl_my_strerror(pTHX_ const int errnum) * either C or the LC_MESSAGES locale */ char * -Perl_my_strerror(pTHX_ const int errnum) +Perl_my_strerror(pTHX_ const int errnum, int * utf8ness) { char *errstr; const char * desired_locale = savepv((IN_LC(LC_MESSAGES)) @@ -6723,6 +6766,8 @@ Perl_my_strerror(pTHX_ const int errnum) const char * orig_MESSAGES_locale; /* XXX Can fail on z/OS */ + PERL_ARGS_ASSERT_MY_STRERROR; + DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES)); orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, desired_locale); @@ -6733,7 +6778,9 @@ Perl_my_strerror(pTHX_ const int errnum) restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale); restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); - DEBUG_STRERROR_RETURN(errstr); + *utf8ness = get_locale_string_utf8ness_i(NULL, LC_MESSAGES_INDEX_, errstr, + UTF8NESS_UNKNOWN); + DEBUG_STRERROR_RETURN(errstr, utf8ness); Safefree(desired_locale); SAVEFREEPV(errstr); diff --git a/makedef.pl b/makedef.pl index 9abacac179f5..07ed77a56595 100644 --- a/makedef.pl +++ b/makedef.pl @@ -271,7 +271,7 @@ sub readvar { if ($ARGS{PLATFORM} ne 'win32') { ++$skip{$_} foreach qw( - Perl_my_setlocale + Perl_get_win32_message_utf8ness ); } diff --git a/mg.c b/mg.c index 3355df1b4b64..40a692a8624b 100644 --- a/mg.c +++ b/mg.c @@ -808,37 +808,6 @@ S_fixup_errno_string(pTHX_ SV* sv) if(strEQ(SvPVX(sv), "")) { sv_catpv(sv, UNKNOWN_ERRNO_MSG); } - else { - - /* In some locales the error string may come back as UTF-8, in which - * case we should turn on that flag. This didn't use to happen, and to - * avoid as many possible backward compatibility issues as possible, we - * don't turn on the flag unless we have to. So the flag stays off for - * an entirely invariant string. We assume that if the string looks - * like UTF-8 in a single script, it really is UTF-8: "text in any - * other encoding that uses bytes with the high bit set is extremely - * unlikely to pass a UTF-8 validity test" - * (http://en.wikipedia.org/wiki/Charset_detection). There is a - * potential that we will get it wrong however, especially on short - * error message text, so do an additional check. */ - if ( ! IN_BYTES /* respect 'use bytes' */ - && is_utf8_non_invariant_string((U8*) SvPVX_const(sv), SvCUR(sv)) - -#ifdef USE_LOCALE_MESSAGES - - && _is_cur_LC_category_utf8(LC_MESSAGES) - -#else /* If can't check directly, at least can see if script is consistent, - under UTF-8, which gives us an extra measure of confidence. */ - - && isSCRIPT_RUN((const U8 *) SvPVX_const(sv), (U8 *) SvEND(sv), - TRUE) /* Means assume UTF-8 */ -#endif - - ) { - SvUTF8_on(sv); - } - } } /* @@ -876,11 +845,16 @@ SV * Perl_sv_string_from_errnum(pTHX_ int errnum, SV *tgtsv) { char const *errstr; + int utf8ness; + if(!tgtsv) tgtsv = sv_newmortal(); - errstr = my_strerror(errnum); + errstr = my_strerror(errnum, &utf8ness); if(errstr) { sv_setpv(tgtsv, errstr); + if (utf8ness > 1) { + SvUTF8_on(tgtsv); + } fixup_errno_string(tgtsv); } else { SvPVCLEAR(tgtsv); @@ -955,9 +929,11 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) SvPVCLEAR(sv); } #elif defined(OS2) + { + int utf8ness; if (!(_emx_env & 0x200)) { /* Under DOS */ sv_setnv(sv, (NV)errno); - sv_setpv(sv, errno ? my_strerror(errno) : ""); + sv_setpv(sv, errno ? my_strerror(errnum, &utf8ness) : ""); } else { if (errno != errno_isOS2) { const int tmp = _syserrno(); @@ -968,8 +944,12 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) sv_setpv(sv, os2error(Perl_rc)); } if (SvOK(sv) && strNE(SvPVX(sv), "")) { + if (utf8ness > 1) { + SvUTF8_on(sv); + } fixup_errno_string(sv); } + } # elif defined(WIN32) { const DWORD dwErr = GetLastError(); @@ -977,6 +957,14 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) if (dwErr) { PerlProc_GetOSError(sv, dwErr); fixup_errno_string(sv); + +# ifdef USE_LOCALE + if ( IN_LOCALE + && get_win32_message_utf8ness(SvPV_nomg_const_nolen(sv))) + { + SvUTF8_on(sv); + } +# endif } else SvPVCLEAR(sv); diff --git a/proto.h b/proto.h index 95713010d115..43a29cd0bc64 100644 --- a/proto.h +++ b/proto.h @@ -2195,8 +2195,9 @@ PERL_CALLCONV int Perl_my_socketpair(int family, int type, int protocol, int fd[ #define PERL_ARGS_ASSERT_MY_STAT PERL_CALLCONV I32 Perl_my_stat_flags(pTHX_ const U32 flags); #define PERL_ARGS_ASSERT_MY_STAT_FLAGS -PERL_CALLCONV char* Perl_my_strerror(pTHX_ const int errnum); -#define PERL_ARGS_ASSERT_MY_STRERROR +PERL_CALLCONV char* Perl_my_strerror(pTHX_ const int errnum, int * utf8ness); +#define PERL_ARGS_ASSERT_MY_STRERROR \ + assert(utf8ness) PERL_CALLCONV char * Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour, int mday, int mon, int year, int wday, int yday, int isdst) __attribute__format__(__strftime__,pTHX_1,0); #define PERL_ARGS_ASSERT_MY_STRFTIME \ @@ -6997,6 +6998,8 @@ PERL_CALLCONV bool Perl_quadmath_format_valid(const char* format); assert(format) #endif #if defined(WIN32) +PERL_CALLCONV bool Perl_get_win32_message_utf8ness(pTHX_ const char * string); +#define PERL_ARGS_ASSERT_GET_WIN32_MESSAGE_UTF8NESS PERL_CALLCONV_NO_RET void win32_croak_not_implemented(const char * fname) __attribute__noreturn__; #define PERL_ARGS_ASSERT_WIN32_CROAK_NOT_IMPLEMENTED \