locale.c: Add fcn for UTF8ness determination

get_locale_string_utf8ness_i() will determine if the string it is passed in the locale it is passed is to be treated as UTF-8, or not.
Perl · Apr 29, 2021 · 33556a3 · 33556a3
1 parent 5515bbd
commit 33556a3
Show file tree

Hide file tree

Showing 4 changed files with 113 additions and 0 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -3219,6 +3219,11 @@ ST	|const char*|category_name |const int category
 ST	|unsigned int|get_category_index|const int category|NULLOK const char * locale
 S	|const char*|switch_category_locale_to_template|const int switch_category|const int template_category|NULLOK const char * template_locale
 S	|void	|restore_switched_locale|const int category|NULLOK const char * const original_locale
+S	|unsigned|get_locale_string_utf8ness_i				\
+				|NULLOK const char * locale		\
+				|const unsigned cat_index		\
+				|NULLOK const char * string		\
+				|const int known_utf8
 #    if defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L)
 S	|const char*|my_langinfo_i|const nl_item item			\
 				|const unsigned int cat_index		\

diff --git a/embed.h b/embed.h
@@ -1712,6 +1712,7 @@
 #    if defined(USE_LOCALE)
 #define category_name		S_category_name
 #define get_category_index	S_get_category_index
+#define get_locale_string_utf8ness_i(a,b,c,d)	S_get_locale_string_utf8ness_i(aTHX_ a,b,c,d)
 #define is_codeset_name_UTF8	S_is_codeset_name_UTF8
 #define is_locale_utf8(a)	S_is_locale_utf8(aTHX_ a)
 #define new_LC_ALL(a)		S_new_LC_ALL(aTHX_ a)

diff --git a/locale.c b/locale.c
@@ -2730,8 +2730,113 @@ S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size)
     return *buf;
 }
 
+STATIC unsigned
+S_get_locale_string_utf8ness_i(pTHX_ const char * locale,
+                                     const unsigned cat_index,
+                                     const char * string,
+                                     const int known_utf8)
+{
+    /* Return to indicate if 'string' in the locale given by the input
+     * arguments should be considered UTF-8 or not.
+     *      0 = definitely not
+     *      1 = immaterial, representation is the same in UTF-8 as not
+     *      2 = defintely yes */
+
+#  define UTF8NESS_UNKNOWN  -1 /* mnemonic value for 'known_utf8' */
+
+    /* input 'known_utf8' indicates if the locale is UTF-8 or not.
+     *      0 => no,
+     *      1 => yes,
+     *      UTF8NESS_UNKNOWN => unknown
+     *
+     * If the input 'locale' is not NULL, use that for the locale; otherwise
+     * use the current locale for the category specified by 'cat_index'.
+     */
+
+    Size_t len;
+    const U8 * first_variant = NULL;
+
+    PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
+    assert(cat_index <= NOMINAL_LC_ALL_INDEX);
+    assert(inRANGE(known_utf8, 0, 1) || known_utf8 == UTF8NESS_UNKNOWN);
+
+    if (string == NULL) {
+        return 0;
+    }
+
+    len = strlen(string);
+
+    if (IN_BYTES) { /* respect 'use bytes' */
+        return 0;
+    }
+
+    /* UTF8ness is immaterial if the representation doesn't vary */
+    if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
+        return 1;
+    }
+
+    /* Can't be UTF-8 if invalid */
+    if (! is_utf8_string((U8 *) first_variant,
+                         len - ((char *) first_variant - string)))
+    {
+        return 0;
+    }
+
+    /* Here and below, we know the string is legal UTF-8, containing at least
+     * one character requiring a sequence of two or more bytes.  It is quite
+     * likely to be UTF-8.  But it pays to be paranoid and do further checking.
+     *
+     * If we already know the UTF-8ness of the locale, then we immediately know
+     * what the string is */
+    if (UNLIKELY(inRANGE(known_utf8, 0, 1))) {
+        return (known_utf8) ? 2 : 0;
+    }
+
+#  if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
+
+    /* Here, we have available the libc functions that can be used to
+     * accurately deterimine the UTF8ness of the underlying locale.  If it is a
+     * UTF-8 locale, the string is UTF-8;  otherwise it was coincidental that
+     * the string is legal UTF-8
+     *
+     * However, if the perl is compiled to not pay attention to the category
+     * being passed in, you might think that that locale is essentially always
+     * the C locale, so it would make sense to say it isn't UTF-8.  But to get
+     * here, the string has to contain characters unknown in the C locale.  And
+     * in fact, Windows boxes are compiled without LC_MESSAGES, as their
+     * message catalog isn't really a part of the locale system.  But those
+     * messages really could be UTF-8, and given that the odds are rather small
+     * of something not being UTF-8 but being syntactically valid UTF-8, khw
+     * has decided to call such strings as UTF-8. */
+
+    if (locale == NULL) {
+        locale = querylocale_i(cat_index);
+    }
+    if (is_locale_utf8(locale)) {
+        return 2;
+    }
+
+    return 0;
+
+#  else
+
+    /* Here, we have a valid UTF-8 string containing non-ASCII characters, and
+     * don't have access to functions to check if the locale is UTF-8 or not.
+     * Assume that it is.  khw tried adding a check that the string is entirely
+     * in a single Unicode script, but discovered the strftime() timezone is
+     * user-settable through the environment, which may be in a different
+     * script than the locale-expected value. */
+    PERL_UNUSED_ARG(locale);
+    PERL_UNUSED_ARG(cat_index);
+
+    return 2;
+
 #endif
 
+}
+
+#endif  /* USE_LOCALE */
+
 int
 Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
 {

diff --git a/proto.h b/proto.h
@@ -5133,6 +5133,8 @@ STATIC const char*	S_category_name(const int category);
 #define PERL_ARGS_ASSERT_CATEGORY_NAME
 STATIC unsigned int	S_get_category_index(const int category, const char * locale);
 #define PERL_ARGS_ASSERT_GET_CATEGORY_INDEX
+STATIC unsigned	S_get_locale_string_utf8ness_i(pTHX_ const char * locale, const unsigned cat_index, const char * string, const int known_utf8);
+#define PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I
 STATIC bool	S_is_codeset_name_UTF8(const char * name);
 #define PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8	\
 	assert(name)