diff --git a/embed.fnc b/embed.fnc index e02f41a6ac2f..4c5a5d576b5b 100644 --- a/embed.fnc +++ b/embed.fnc @@ -3219,6 +3219,11 @@ ST |const char*|category_name |const int category ST |unsigned int|get_category_index|const int category|NULLOK const char * locale S |const char*|switch_category_locale_to_template|const int switch_category|const int template_category|NULLOK const char * template_locale S |void |restore_switched_locale|const int category|NULLOK const char * const original_locale +S |unsigned|get_locale_string_utf8ness_i \ + |NULLOK const char * locale \ + |const unsigned cat_index \ + |NULLOK const char * string \ + |const int known_utf8 # if defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L) S |const char*|my_langinfo_i|const nl_item item \ |const unsigned int cat_index \ diff --git a/embed.h b/embed.h index bfa375cd3782..403b1351bfb5 100644 --- a/embed.h +++ b/embed.h @@ -1712,6 +1712,7 @@ # if defined(USE_LOCALE) #define category_name S_category_name #define get_category_index S_get_category_index +#define get_locale_string_utf8ness_i(a,b,c,d) S_get_locale_string_utf8ness_i(aTHX_ a,b,c,d) #define is_codeset_name_UTF8 S_is_codeset_name_UTF8 #define is_locale_utf8(a) S_is_locale_utf8(aTHX_ a) #define new_LC_ALL(a) S_new_LC_ALL(aTHX_ a) diff --git a/locale.c b/locale.c index f491976a8b97..3032a10d8768 100644 --- a/locale.c +++ b/locale.c @@ -2730,8 +2730,113 @@ S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size) return *buf; } +STATIC unsigned +S_get_locale_string_utf8ness_i(pTHX_ const char * locale, + const unsigned cat_index, + const char * string, + const int known_utf8) +{ + /* Return to indicate if 'string' in the locale given by the input + * arguments should be considered UTF-8 or not. + * 0 = definitely not + * 1 = immaterial, representation is the same in UTF-8 as not + * 2 = defintely yes */ + +# define UTF8NESS_UNKNOWN -1 /* mnemonic value for 'known_utf8' */ + + /* input 'known_utf8' indicates if the locale is UTF-8 or not. + * 0 => no, + * 1 => yes, + * UTF8NESS_UNKNOWN => unknown + * + * If the input 'locale' is not NULL, use that for the locale; otherwise + * use the current locale for the category specified by 'cat_index'. + */ + + Size_t len; + const U8 * first_variant = NULL; + + PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I; + assert(cat_index <= NOMINAL_LC_ALL_INDEX); + assert(inRANGE(known_utf8, 0, 1) || known_utf8 == UTF8NESS_UNKNOWN); + + if (string == NULL) { + return 0; + } + + len = strlen(string); + + if (IN_BYTES) { /* respect 'use bytes' */ + return 0; + } + + /* UTF8ness is immaterial if the representation doesn't vary */ + if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) { + return 1; + } + + /* Can't be UTF-8 if invalid */ + if (! is_utf8_string((U8 *) first_variant, + len - ((char *) first_variant - string))) + { + return 0; + } + + /* Here and below, we know the string is legal UTF-8, containing at least + * one character requiring a sequence of two or more bytes. It is quite + * likely to be UTF-8. But it pays to be paranoid and do further checking. + * + * If we already know the UTF-8ness of the locale, then we immediately know + * what the string is */ + if (UNLIKELY(inRANGE(known_utf8, 0, 1))) { + return (known_utf8) ? 2 : 0; + } + +# if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) + + /* Here, we have available the libc functions that can be used to + * accurately deterimine the UTF8ness of the underlying locale. If it is a + * UTF-8 locale, the string is UTF-8; otherwise it was coincidental that + * the string is legal UTF-8 + * + * However, if the perl is compiled to not pay attention to the category + * being passed in, you might think that that locale is essentially always + * the C locale, so it would make sense to say it isn't UTF-8. But to get + * here, the string has to contain characters unknown in the C locale. And + * in fact, Windows boxes are compiled without LC_MESSAGES, as their + * message catalog isn't really a part of the locale system. But those + * messages really could be UTF-8, and given that the odds are rather small + * of something not being UTF-8 but being syntactically valid UTF-8, khw + * has decided to call such strings as UTF-8. */ + + if (locale == NULL) { + locale = querylocale_i(cat_index); + } + if (is_locale_utf8(locale)) { + return 2; + } + + return 0; + +# else + + /* Here, we have a valid UTF-8 string containing non-ASCII characters, and + * don't have access to functions to check if the locale is UTF-8 or not. + * Assume that it is. khw tried adding a check that the string is entirely + * in a single Unicode script, but discovered the strftime() timezone is + * user-settable through the environment, which may be in a different + * script than the locale-expected value. */ + PERL_UNUSED_ARG(locale); + PERL_UNUSED_ARG(cat_index); + + return 2; + #endif +} + +#endif /* USE_LOCALE */ + int Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len) { diff --git a/proto.h b/proto.h index d36761946822..12f0d63bf7d7 100644 --- a/proto.h +++ b/proto.h @@ -5133,6 +5133,8 @@ STATIC const char* S_category_name(const int category); #define PERL_ARGS_ASSERT_CATEGORY_NAME STATIC unsigned int S_get_category_index(const int category, const char * locale); #define PERL_ARGS_ASSERT_GET_CATEGORY_INDEX +STATIC unsigned S_get_locale_string_utf8ness_i(pTHX_ const char * locale, const unsigned cat_index, const char * string, const int known_utf8); +#define PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I STATIC bool S_is_codeset_name_UTF8(const char * name); #define PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8 \ assert(name)