From ccda44cdb3b5006266cff79b4e49bfd45454c735 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 17 Feb 2021 09:56:06 -0700 Subject: [PATCH] locale.c: Improve non-nl_langinfo() CODESET calc Prior to this commit, on non-Windows platforms that don't have a nl_langinfo() libc function, the code completely punted computation of the CODESET item. I have not been able to figure out how to do this, even going to the locale definition files on disk (which may vary anyway), but we can do a lot better than punting. This commit adds three checks: 1) If the locale name is C or POSIX, we know the codeset 2) We can detect if a locale is UTF-8. If it is, that is the codeset. Many modern locales are of this ilk. 3) Failing that, some locales have the codeset appear in the name, following a dot. It isn't perfect, but it's a lot better than completely punting. --- ext/I18N-Langinfo/Langinfo.pm | 18 +++++---- locale.c | 73 ++++++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 29 deletions(-) diff --git a/ext/I18N-Langinfo/Langinfo.pm b/ext/I18N-Langinfo/Langinfo.pm index 7206e1ce859e..3cd71f0aa222 100644 --- a/ext/I18N-Langinfo/Langinfo.pm +++ b/ext/I18N-Langinfo/Langinfo.pm @@ -72,7 +72,7 @@ our @EXPORT_OK = qw( YESSTR ); -our $VERSION = '0.19'; +our $VERSION = '0.20'; XSLoader::load(); @@ -182,8 +182,11 @@ For the eras based on typically some ruler, such as the Japanese Emperor =head2 For systems without C<nl_langinfo> -Starting in Perl 5.28, this module is available even on systems that lack a -native C<nl_langinfo>. On such systems, it uses various methods to construct +This module originally was just a wrapper for the libc C<nl_langinfo> +function, and did not work on systems lacking it, such as Windows. + +Starting in Perl 5.28, this module works on all platforms. When +C<nl_langinfo> is not available, it uses various methods to construct what that function, if present, would return. But there are potential glitches. 
These are the items that could be different: @@ -195,8 +198,11 @@ Unimplemented, so returns C<"">. =item C<CODESET> -Unimplemented, except on Windows, due to the vagaries of vendor locale names, -returning C<""> on non-Windows. +This should work properly for Windows platforms. On almost all other modern +platforms, it will reliably return "UTF-8" if that is the code set. +Otherwise, it depends on the locale's name. If that is of the form +C<foo.bar>, it will assume C<bar> is the code set; and it also knows about the +two locales "C" and "POSIX". If none of those apply it returns C<"">. =item C @@ -275,8 +281,6 @@ workaround for this; patches welcome: see L. L, L, L, L. -The langinfo() function is just a wrapper for the C nl_langinfo() interface. - =head1 AUTHOR Jarkko Hietaniemi, E<lt>jhi@hut.fiE<gt>. Now maintained by Perl 5 porters. diff --git a/locale.c b/locale.c index 81cb2381fdf9..24322466631e 100644 --- a/locale.c +++ b/locale.c @@ -3130,6 +3130,8 @@ S_my_langinfo(pTHX_ # else /* Below, emulate nl_langinfo as best we can */ { + const char * locale; + # ifdef HAS_SOME_LOCALECONV @@ -3511,19 +3513,13 @@ S_my_langinfo(pTHX_ # endif case CODESET: + locale = querylocale_c(LC_CTYPE); -# ifndef WIN32 - - /* On non-windows, this is unimplemented, in part because of - * inconsistencies between vendors. The Darwin native - * nl_langinfo() implementation simply looks at everything past - * any dot in the name, but that doesn't work for other - * vendors. Many Linux locales that don't have UTF-8 in their - * names really are UTF-8, for example; z/OS locales that do - * have UTF-8 in their names, aren't really UTF-8 */ - return ""; + if (isNAME_C_OR_POSIX(locale)) { + return C_codeset; + } -# else +# ifdef WIN32 { /* This function retrieves the code page. 
It is subject to change, @@ -3539,14 +3535,42 @@ S_my_langinfo(pTHX_ # endif - { /* Temporarily unreachable */ - const char * name = querylocale_c(LC_CTYPE); + /* The codeset is important, but khw did not figure out a way for it to + * be retrieved without nl_langinfo() (or the function above on + * Windows). But even if we can't get it directly, we can usually + * determine if it is a UTF-8 locale or not. If it is UTF-8, we + * (correctly) use that for the code set. If not, perhaps the code set + * will be in the name, like "foo.8859-6" */ + +# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) - if (isNAME_C_OR_POSIX(name)) { - return C_codeset; + { + /* These functions weren't in the published C89 standard, but were + * added soon after, so that many sources consider them to be C89, + * and are likely available in a compiler that claims to support + * C89. */ + + wchar_t wc; + int mbtowc_ret; + + (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0); /* Reset shift state */ + mbtowc_ret = Perl_mbtowc_(aTHX_ &wc, + STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8)); + if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) { + return "UTF-8"; + } } - retval = (const char *) strchr(name, '.'); + /* Otherwise drop down to try to get the code set from the locale name. + * */ + +# endif + + /* Here we know it isn't a UTF-8 locale (if mbtowc() was available on + * the platform). All that is left us is looking at the locale name. + * + * Find any dot in the locale name */ + retval = (const char *) strchr(locale, '.'); if (! retval) { return ""; /* Alas, no dot */ } @@ -3554,21 +3578,26 @@ S_my_langinfo(pTHX_ /* Use everything past the dot */ retval++; - retval = save_to_buffer(retval, retbufp, retbuf_sizep); - } +# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) - break; + /* Here, we know that the locale did not act like a proper UTF-8 one. 
+ * So if it claims to be UTF-8, it is a lie */ + if (is_codeset_name_UTF8(retval)) { + return ""; + } # endif - } + return save_to_buffer(retval, retbufp, retbuf_sizep); + } /* Giant switch() of nl_langinfo() items */ } return retval; -# endif +# endif /* All the implementations of my_langinfo() */ /*--------------------------------------------------------------------------*/ -} + +} /* my_langinfo() */ #endif /* USE_LOCALE */