Skip to content

Commit

Permalink
locale.c: Add is_locale_utf8()
Browse files Browse the repository at this point in the history
Previous commits have added the infrastructure to be able to determine
if a locale is UTF-8.  This will prove useful, and this commit adds
a function to encapsulate this information, and uses it in a couple of
places, with more to come in future commits.

As a final fallback, this uses mbtowc(), which is supposed to be available
in C99.  Future commits will add heuristics for when that function isn't
available or is known to be unreliable on a particular system.
  • Loading branch information
khwilliamson committed Aug 10, 2022
1 parent cc41e95 commit 1353a7a
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 3 deletions.
1 change: 1 addition & 0 deletions embed.fnc
Expand Up @@ -3334,6 +3334,7 @@ So |const char *|toggle_locale_i|const unsigned switch_cat_index \
So |void |restore_toggled_locale_i|const unsigned cat_index \
|NULLOK const char * original_locale \
|const line_t caller_line
S |bool |is_locale_utf8 |NN const char * locale
# if defined(USE_POSIX_2008_LOCALE)
S |const char*|emulate_setlocale_i|const unsigned int index \
|NULLOK const char* new_locale \
Expand Down
1 change: 1 addition & 0 deletions embed.h
Expand Up @@ -1706,6 +1706,7 @@
#define category_name S_category_name
#define get_category_index S_get_category_index
#define is_codeset_name_UTF8 S_is_codeset_name_UTF8
#define is_locale_utf8(a) S_is_locale_utf8(aTHX_ a)
#define new_LC_ALL(a) S_new_LC_ALL(aTHX_ a)
#define new_collate(a) S_new_collate(aTHX_ a)
#define new_ctype(a) S_new_ctype(aTHX_ a)
Expand Down
38 changes: 35 additions & 3 deletions locale.c
Expand Up @@ -1846,7 +1846,7 @@ S_new_ctype(pTHX_ const char *newctype)
PL_warn_locale = NULL;
}

PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);

/* A UTF-8 locale gets standard rules. But note that code still has to
* handle this specially because of the three problematic code points */
Expand Down Expand Up @@ -2213,7 +2213,7 @@ S_new_collate(pTHX_ const char *newcoll)
goto is_standard_collation;
}

PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
PL_in_utf8_COLLATE_locale = is_locale_utf8(newcoll);
PL_strxfrm_NUL_replacement = '\0';
PL_strxfrm_max_cp = 0;

Expand Down Expand Up @@ -5770,7 +5770,39 @@ S_is_codeset_name_UTF8(const char * name)
&& (len == 4 || name[3] == '-'));
}

#endif
STATIC bool
S_is_locale_utf8(pTHX_ const char * locale)
{
    /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise.
     *
     * It asks my_langinfo_c() for the CODESET item of 'locale' in the
     * LC_CTYPE category, and then checks that name with
     * is_codeset_name_UTF8().
     *
     * 'locale' must not be NULL.
     *
     * On builds without USE_LOCALE_CTYPE, or on EBCDIC platforms, this always
     * returns FALSE without examining 'locale'. */

    /* Check the argument up front, before anything below gets a chance to
     * use it, and regardless of which preprocessor branch is compiled */
    PERL_ARGS_ASSERT_IS_LOCALE_UTF8;

# if ! defined(USE_LOCALE_CTYPE) \
|| defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */

    PERL_UNUSED_ARG(locale);

    return FALSE;

# else

    /* The address of this is handed to my_langinfo_c(); whatever is left in
     * it afterwards is released with Safefree() below */
    const char * scratch_buffer = NULL;
    const char * codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
                                         &scratch_buffer, NULL);
    bool retval = is_codeset_name_UTF8(codeset);

    /* Must print 'codeset' before the free below.  NOTE(review): it
     * presumably can point into 'scratch_buffer' -- confirm against
     * my_langinfo_c() */
    DEBUG_Lv(PerlIO_printf(Perl_debug_log,
                           "found codeset=%s, is_utf8=%d\n", codeset, retval));

    Safefree(scratch_buffer);
    return retval;

# endif

}

#endif /* USE_LOCALE */

bool
Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
Expand Down
3 changes: 3 additions & 0 deletions proto.h
Expand Up @@ -5635,6 +5635,9 @@ STATIC unsigned int S_get_category_index(const int category, const char * locale
STATIC bool S_is_codeset_name_UTF8(const char * name);
#define PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8 \
assert(name)
STATIC bool S_is_locale_utf8(pTHX_ const char * locale);
#define PERL_ARGS_ASSERT_IS_LOCALE_UTF8 \
assert(locale)
STATIC void S_new_LC_ALL(pTHX_ const char* unused);
#define PERL_ARGS_ASSERT_NEW_LC_ALL
STATIC void S_new_collate(pTHX_ const char* newcoll);
Expand Down

0 comments on commit 1353a7a

Please sign in to comment.