From ccda44cdb3b5006266cff79b4e49bfd45454c735 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Wed, 17 Feb 2021 09:56:06 -0700
Subject: [PATCH] locale.c: Improve non-nl_langinfo() CODESET calc

Prior to this commit, on non-Windows platforms that don't have a
nl_langinfo() libc function, the code completely punted computation of
the CODESET item.  I have not been able to figure out how to do this,
even going to the locale definition files on disk (which may vary
anyway), but we can do a lot better than punting.

This commit adds three checks:

1) If the locale name is C or POSIX, we know the codeset

2) We can detect if a locale is UTF-8.  If it is, that is the codeset.
Many modern locales are of this ilk.

3) Failing that, some locales have the codeset appear in the name,
following a dot.

It isn't perfect, but it's a lot better than completely punting.
---
 ext/I18N-Langinfo/Langinfo.pm | 18 +++++----
 locale.c                      | 73 ++++++++++++++++++++++++-----------
 2 files changed, 62 insertions(+), 29 deletions(-)
diff --git a/ext/I18N-Langinfo/Langinfo.pm b/ext/I18N-Langinfo/Langinfo.pm
index 7206e1ce859e..3cd71f0aa222 100644
--- a/ext/I18N-Langinfo/Langinfo.pm
+++ b/ext/I18N-Langinfo/Langinfo.pm
@@ -72,7 +72,7 @@ our @EXPORT_OK = qw(
 	YESSTR
 );
 
-our $VERSION = '0.19';
+our $VERSION = '0.20';
 
 XSLoader::load();
 
@@ -182,8 +182,11 @@ For the eras based on typically some ruler, such as the Japanese Emperor
 
 =head2 For systems without C<nl_langinfo>
 
-Starting in Perl 5.28, this module is available even on systems that lack a
-native C<nl_langinfo>.  On such systems, it uses various methods to construct
+This module originally was just a wrapper for the libc C<nl_langinfo>
+function, and did not work on systems lacking it, such as Windows.
+
+Starting in Perl 5.28, this module works on all platforms.  When
+C<nl_langinfo> is not available, it uses various methods to construct
 what that function, if present, would return.  But there are potential
 glitches.  These are the items that could be different:
 
@@ -195,8 +198,11 @@ Unimplemented, so returns C<"">.
 
 =item C<CODESET>
 
-Unimplemented, except on Windows, due to the vagaries of vendor locale names,
-returning C<""> on non-Windows.
+This should work properly for Windows platforms.  On almost all other modern
+platforms, it will reliably return "UTF-8" if that is the code set.
+Otherwise, it depends on the locale's name.  If that is of the form
+C<foo.bar>, it will assume C<bar> is the code set; and it also knows about the
+two locales "C" and "POSIX".  If none of those apply it returns C<"">.
 
 =item C<YESEXPR>
 
@@ -275,8 +281,6 @@ workaround for this; patches welcome: see L<perlapi/switch_to_global_locale>.
 
 L<perllocale>, L<POSIX/localeconv>, L<POSIX/setlocale>, L<nl_langinfo(3)>.
 
-The langinfo() function is just a wrapper for the C nl_langinfo() interface.
-
 =head1 AUTHOR
 
 Jarkko Hietaniemi, E<lt>jhi@hut.fiE<gt>.  Now maintained by Perl 5 porters.
diff --git a/locale.c b/locale.c
index 81cb2381fdf9..24322466631e 100644
--- a/locale.c
+++ b/locale.c
@@ -3130,6 +3130,8 @@ S_my_langinfo(pTHX_
 #  else   /* Below, emulate nl_langinfo as best we can */
 
     {
+        const char * locale;
+
 
 #    ifdef HAS_SOME_LOCALECONV
 
@@ -3511,19 +3513,13 @@ S_my_langinfo(pTHX_
 #  endif
 
       case CODESET:
+        locale = querylocale_c(LC_CTYPE);
 
-#  ifndef WIN32
-
-        /* On non-windows, this is unimplemented, in part because of
-         * inconsistencies between vendors.  The Darwin native
-         * nl_langinfo() implementation simply looks at everything past
-         * any dot in the name, but that doesn't work for other
-         * vendors.  Many Linux locales that don't have UTF-8 in their
-         * names really are UTF-8, for example; z/OS locales that do
-         * have UTF-8 in their names, aren't really UTF-8 */
-        return "";
+        if (isNAME_C_OR_POSIX(locale)) {
+            return C_codeset;
+        }
 
-#  else
+#    ifdef WIN32
 
         {
             /* This function retrieves the code page.  It is subject to change,
@@ -3539,14 +3535,42 @@ S_my_langinfo(pTHX_
 
 #  endif
 
-        {   /* Temporarily unreachable */
-            const char * name = querylocale_c(LC_CTYPE);
+        /* The codeset is important, but khw did not figure out a way for it to
+         * be retrieved without nl_langinfo() (or the function above on
+         * Windows).  But even if we can't get it directly, we can usually
+         * determine if it is a UTF-8 locale or not.  If it is UTF-8, we
+         * (correctly) use that for the code set.  If not, perhaps the code set
+         * will be in the name, like "foo.8859-6" */
+
+#    if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
 
-            if (isNAME_C_OR_POSIX(name)) {
-                return C_codeset;
+        {
+            /* These functions weren't in the published C89 standard, but were
+             * added soon after, so that many sources consider them to be C89,
+             * and are likely available in a compiler that claims to support
+             * C89. */
+
+            wchar_t wc;
+            int mbtowc_ret;
+
+            (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);    /* Reset shift state */
+            mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
+                                     STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
+            if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
+                return "UTF-8";
+            }
             }
 
-            retval = (const char *) strchr(name, '.');
+        /* Otherwise drop down to try to get the code set from the locale name.
+         * */
+
+#    endif
+
+        /* Here we know it isn't a UTF-8 locale (if mbtowc() was available on
+         * the platform).  All that is left us is looking at the locale name.
+         *
+         * Find any dot in the locale name */
+        retval = (const char *) strchr(locale, '.');
             if (! retval) {
                 return "";  /* Alas, no dot */
                 }
@@ -3554,21 +3578,26 @@ S_my_langinfo(pTHX_
             /* Use everything past the dot */
             retval++;
 
-            retval = save_to_buffer(retval, retbufp, retbuf_sizep);
-        }
+#    if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
 
-        break;
+        /* Here, we know that the locale did not act like a proper UTF-8 one.
+         * So if it claims to be UTF-8, it is a lie */
+        if (is_codeset_name_UTF8(retval)) {
+            return "";
+        }
 
 #  endif
 
-        }
+        return save_to_buffer(retval, retbufp, retbuf_sizep);
+    } /* Giant switch() of nl_langinfo() items */
     }
 
     return retval;
 
-#  endif
+#  endif    /* All the implementations of my_langinfo() */
 /*--------------------------------------------------------------------------*/
-}
+
+}   /* my_langinfo() */
 
 #endif      /* USE_LOCALE */