Skip to content

Commit

Permalink
locale.c: Add is_locale_utf8()
Browse files Browse the repository at this point in the history
Previous commits have added the infrastructure to be able to determine
if a locale is UTF-8.  This will prove useful, and this commit adds
a function to encapsulate this information, and uses it in a couple of
places, with more to come in future commits.

This uses mbtowc() as a final fallback, which some sources view as a
late addition to C89, and others as not really being available until C99.
Future commits will add heuristics for when that function isn't available.
  • Loading branch information
khwilliamson committed May 5, 2021
1 parent 74acc90 commit 8a6fdec
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 3 deletions.
1 change: 1 addition & 0 deletions embed.fnc
Expand Up @@ -3260,6 +3260,7 @@ S |const char *|toggle_locale_i|const unsigned switch_cat_index \
|NN const char * new_locale
S |void |restore_toggled_locale_i|const unsigned cat_index \
|NULLOK const char * original_locale
S |bool |is_locale_utf8 |NN const char * locale
ST |bool |is_codeset_name_UTF8|NN const char * name
# ifdef USE_POSIX_2008_LOCALE
S |const char*|emulate_setlocale_i|const unsigned int index \
Expand Down
1 change: 1 addition & 0 deletions embed.h
Expand Up @@ -1713,6 +1713,7 @@
#define category_name S_category_name
#define get_category_index S_get_category_index
#define is_codeset_name_UTF8 S_is_codeset_name_UTF8
#define is_locale_utf8(a) S_is_locale_utf8(aTHX_ a)
#define new_LC_ALL(a) S_new_LC_ALL(aTHX_ a)
#define new_collate(a) S_new_collate(aTHX_ a)
#define new_ctype(a) S_new_ctype(aTHX_ a)
Expand Down
40 changes: 37 additions & 3 deletions locale.c
Expand Up @@ -1777,7 +1777,7 @@ S_new_ctype(pTHX_ const char *newctype)
PL_warn_locale = NULL;
}

PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);

/* A UTF-8 locale gets standard rules. But note that code still has to
* handle this specially because of the three problematic code points */
Expand Down Expand Up @@ -2147,7 +2147,7 @@ S_new_collate(pTHX_ const char *newcoll)
goto is_standard_collation;
}

PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
PL_in_utf8_COLLATE_locale = is_locale_utf8(newcoll);
PL_strxfrm_NUL_replacement = '\0';
PL_strxfrm_max_cp = 0;

Expand Down Expand Up @@ -5739,7 +5739,41 @@ S_is_codeset_name_UTF8(const char * name)
&& (len == 4 || name[3] == '-'));
}

#endif
STATIC bool
S_is_locale_utf8(pTHX_ const char * locale)
{
    /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise.
     *
     * It asks my_langinfo_c() for the CODESET item of 'locale' in the
     * LC_CTYPE category, and then checks the returned codeset name with
     * is_codeset_name_UTF8().
     *
     * When LC_CTYPE locale handling isn't compiled in, or on EBCDIC builds,
     * it unconditionally returns FALSE without examining 'locale'.  EBCDIC is
     * an imperfect proxy for os390, on which there aren't any real UTF-8
     * locales at this time. */

#  if ! defined(USE_LOCALE_CTYPE) || defined(EBCDIC)

    PERL_UNUSED_ARG(locale);

    return FALSE;

#  else

    const char * scratch_buffer = NULL;
    const char * codeset;
    bool retval;

    /* Validate the argument before its first use; doing the lookup in the
     * declaration initializers would dereference 'locale' ahead of this
     * check */
    PERL_ARGS_ASSERT_IS_LOCALE_UTF8;

    codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
                            &scratch_buffer, NULL);
    retval = is_codeset_name_UTF8(codeset);

    DEBUG_Lv(PerlIO_printf(Perl_debug_log,
                           "%s: %d: found codeset=%s, is_utf8=%d\n",
                           __FILE__, __LINE__, codeset, retval));

    /* NOTE(review): presumably my_langinfo_c() may allocate its result into
     * 'scratch_buffer', which is why it is freed only after the last use of
     * 'codeset' -- confirm against my_langinfo_c() */
    Safefree(scratch_buffer);
    return retval;

#  endif

}

#endif /* USE_LOCALE */

bool
Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
Expand Down
3 changes: 3 additions & 0 deletions proto.h
Expand Up @@ -5136,6 +5136,9 @@ STATIC unsigned int S_get_category_index(const int category, const char * locale
STATIC bool S_is_codeset_name_UTF8(const char * name);
#define PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8 \
assert(name)
STATIC bool S_is_locale_utf8(pTHX_ const char * locale);
#define PERL_ARGS_ASSERT_IS_LOCALE_UTF8 \
assert(locale)
STATIC void S_new_LC_ALL(pTHX_ const char* unused);
#define PERL_ARGS_ASSERT_NEW_LC_ALL
STATIC void S_new_collate(pTHX_ const char* newcoll);
Expand Down

0 comments on commit 8a6fdec

Please sign in to comment.