Skip to content

Commit

Permalink
locale.c: Add fcn for UTF8ness determination
Browse files Browse the repository at this point in the history
get_locale_string_utf8ness_i() will determine if the string it is passed
in the locale it is passed is to be treated as UTF-8, or not.
  • Loading branch information
khwilliamson committed Apr 29, 2021
1 parent 5515bbd commit 33556a3
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 0 deletions.
5 changes: 5 additions & 0 deletions embed.fnc
Expand Up @@ -3219,6 +3219,11 @@ ST |const char*|category_name |const int category
ST |unsigned int|get_category_index|const int category|NULLOK const char * locale
S |const char*|switch_category_locale_to_template|const int switch_category|const int template_category|NULLOK const char * template_locale
S |void |restore_switched_locale|const int category|NULLOK const char * const original_locale
S |unsigned|get_locale_string_utf8ness_i \
|NULLOK const char * locale \
|const unsigned cat_index \
|NULLOK const char * string \
|const int known_utf8
# if defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L)
S |const char*|my_langinfo_i|const nl_item item \
|const unsigned int cat_index \
Expand Down
1 change: 1 addition & 0 deletions embed.h
Expand Up @@ -1712,6 +1712,7 @@
# if defined(USE_LOCALE)
#define category_name S_category_name
#define get_category_index S_get_category_index
#define get_locale_string_utf8ness_i(a,b,c,d) S_get_locale_string_utf8ness_i(aTHX_ a,b,c,d)
#define is_codeset_name_UTF8 S_is_codeset_name_UTF8
#define is_locale_utf8(a) S_is_locale_utf8(aTHX_ a)
#define new_LC_ALL(a) S_new_LC_ALL(aTHX_ a)
Expand Down
105 changes: 105 additions & 0 deletions locale.c
Expand Up @@ -2730,8 +2730,113 @@ S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size)
return *buf;
}

STATIC unsigned
S_get_locale_string_utf8ness_i(pTHX_ const char * locale,
const unsigned cat_index,
const char * string,
const int known_utf8)
{
/* Return to indicate if 'string' in the locale given by the input
* arguments should be considered UTF-8 or not.
* 0 = definitely not
* 1 = immaterial, representation is the same in UTF-8 as not
* 2 = defintely yes */

# define UTF8NESS_UNKNOWN -1 /* mnemonic value for 'known_utf8' */

/* input 'known_utf8' indicates if the locale is UTF-8 or not.
* 0 => no,
* 1 => yes,
* UTF8NESS_UNKNOWN => unknown
*
* If the input 'locale' is not NULL, use that for the locale; otherwise
* use the current locale for the category specified by 'cat_index'.
*/

Size_t len;
const U8 * first_variant = NULL;

PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
assert(cat_index <= NOMINAL_LC_ALL_INDEX);
assert(inRANGE(known_utf8, 0, 1) || known_utf8 == UTF8NESS_UNKNOWN);

if (string == NULL) {
return 0;
}

len = strlen(string);

if (IN_BYTES) { /* respect 'use bytes' */
return 0;
}

/* UTF8ness is immaterial if the representation doesn't vary */
if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
return 1;
}

/* Can't be UTF-8 if invalid */
if (! is_utf8_string((U8 *) first_variant,
len - ((char *) first_variant - string)))
{
return 0;
}

/* Here and below, we know the string is legal UTF-8, containing at least
* one character requiring a sequence of two or more bytes. It is quite
* likely to be UTF-8. But it pays to be paranoid and do further checking.
*
* If we already know the UTF-8ness of the locale, then we immediately know
* what the string is */
if (UNLIKELY(inRANGE(known_utf8, 0, 1))) {
return (known_utf8) ? 2 : 0;
}

# if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)

/* Here, we have available the libc functions that can be used to
* accurately deterimine the UTF8ness of the underlying locale. If it is a
* UTF-8 locale, the string is UTF-8; otherwise it was coincidental that
* the string is legal UTF-8
*
* However, if the perl is compiled to not pay attention to the category
* being passed in, you might think that that locale is essentially always
* the C locale, so it would make sense to say it isn't UTF-8. But to get
* here, the string has to contain characters unknown in the C locale. And
* in fact, Windows boxes are compiled without LC_MESSAGES, as their
* message catalog isn't really a part of the locale system. But those
* messages really could be UTF-8, and given that the odds are rather small
* of something not being UTF-8 but being syntactically valid UTF-8, khw
* has decided to call such strings as UTF-8. */

if (locale == NULL) {
locale = querylocale_i(cat_index);
}
if (is_locale_utf8(locale)) {
return 2;
}

return 0;

# else

/* Here, we have a valid UTF-8 string containing non-ASCII characters, and
* don't have access to functions to check if the locale is UTF-8 or not.
* Assume that it is. khw tried adding a check that the string is entirely
* in a single Unicode script, but discovered the strftime() timezone is
* user-settable through the environment, which may be in a different
* script than the locale-expected value. */
PERL_UNUSED_ARG(locale);
PERL_UNUSED_ARG(cat_index);

return 2;

#endif

}

#endif /* USE_LOCALE */

int
Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
{
Expand Down
2 changes: 2 additions & 0 deletions proto.h
Expand Up @@ -5133,6 +5133,8 @@ STATIC const char* S_category_name(const int category);
#define PERL_ARGS_ASSERT_CATEGORY_NAME
STATIC unsigned int S_get_category_index(const int category, const char * locale);
#define PERL_ARGS_ASSERT_GET_CATEGORY_INDEX
STATIC unsigned S_get_locale_string_utf8ness_i(pTHX_ const char * locale, const unsigned cat_index, const char * string, const int known_utf8);
#define PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I
STATIC bool S_is_codeset_name_UTF8(const char * name);
#define PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8 \
assert(name)
Expand Down

0 comments on commit 33556a3

Please sign in to comment.