From 30bce767d39e4f10a6402ff29f510ec8a4e595dd Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Sat, 10 Apr 2021 07:19:37 -0600
Subject: [PATCH] locale.c: Add fallbacks if no mbtowc()

This adds heuristics that work well for non-English locales to determine
whether or not a locale is UTF-8 when mbtowc() isn't available.  It would
be a very rare compiler that didn't have mbtowc() these days, but this
covers that case as best as I have been able to figure out.
---
 locale.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 107 insertions(+), 3 deletions(-)

diff --git a/locale.c b/locale.c
index 939c22c53111..4b41d0666e84 100644
--- a/locale.c
+++ b/locale.c
@@ -4211,10 +4211,107 @@ S_my_langinfo_i(pTHX_
         /* Otherwise drop down to try to get the code set from the locale name.
          * */
 
+#    else
+
+        {
+            /* Here, mbtowc() is not available.  Sling together several
+             * possibilities, depending on platform capabilities and what we
+             * found.
+             *
+             * We will likely be able to tell whether or not the locale is
+             * UTF-8, at least for non-English locales */
+
+            int locale_is_utf8 = 1;
+            const char * scratch_buf = NULL;
+            Size_t scratch_buf_size = 0;
+
+#      ifdef USE_LOCALE_MONETARY
+
+            /* First try looking at the currency symbol (via a recursive call)
+             * to see if it disambiguates things.  Often that will be in the
+             * native script, and if the symbol isn't legal UTF-8, we know that
+             * the locale isn't either. */
+            (void) my_langinfo_c(CRNCYSTR, LC_MONETARY, locale,
+                              &scratch_buf, &scratch_buf_size, &locale_is_utf8);
+
 #    endif
+#      if defined(USE_LOCALE_TIME) && defined(HAS_STRFTIME)
+
+            /* If that didn't rule out being UTF-8, we look at LC_TIME entries,
+             * like the names of the months or weekdays.  We quit at the first
+             * one that is illegal UTF-8 */
+            if (locale_is_utf8 != 0) {
+                int this_is_utf8;
+                unsigned int i;
+                const char * orig_switched_locale;
+                const int times[] = {
+                    DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
+                    MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
+                                                MON_9, MON_10, MON_11, MON_12,
+                    ALT_DIGITS, AM_STR, PM_STR,
+                    ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6,
+                                                                 ABDAY_7,
+                    ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
+                    ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
+                };
+
+                /* The code in the recursive call can handle switching the
+                 * locales, but by doing it here, we avoid repeated switching
+                 * in the loop */
+                orig_switched_locale = toggle_locale_c(LC_TIME, locale);
+
+                for (i = 0; i < C_ARRAY_LENGTH(times); i++) {
+                    (void) my_langinfo_c(times[i], LC_TIME, NULL,
+                                       &scratch_buf, &scratch_buf_size,
+                                       &this_is_utf8);
+                    if (this_is_utf8 == 0) {
+                        break;
+                    }
+                    else if (this_is_utf8 == 2) {
+                        locale_is_utf8 = 2;
+                    }
+                }
+                restore_toggled_locale_c(LC_TIME, orig_switched_locale);
 
-        /* Here we know it isn't a UTF-8 locale (if mbtowc() was available on
-         * the platform).  All that is left us is looking at the locale name.
+                /* Here we have gone through all the LC_TIME elements.  If any
+                 * aren't legal UTF-8, locale_is_utf8==0; otherwise if any are
+                 * non-ASCII UTF-8, locale_is_utf8==2.  That means
+                 * locale_is_utf8==1 iff all were ASCII */
+            }
+
+#      endif    /* LC_TIME */
+
+            Safefree(scratch_buf);
+
+            /* We could also examine the LC_MESSAGES errno strings for more
+             * evidence, but experience has shown that many systems don't
+             * actually have translations for them from the original English,
+             * so everything in them is ASCII, which is of no help to us.  A
+             * Configure probe could possibly be written to see if this
+             * platform has non-ASCII error messages.  But given the fact that
+             * we would be doing this only on compilers that aren't full C89,
+             * those are the systems that wouldn't have translations anyway. */
+
+            /* Here we have figured out, to the best of our ability, if the
+             * locale is or isn't UTF-8.  The result will very likely be
+             * correct for non-English locales unless (uncommonly) the
+             * language's script is entirely ASCII (and unless we don't have
+             * access to LC_TIME).  But, otherwise, it comes down to if the
+             * locale's name ends in something like "UTF-8". */
+            if (locale_is_utf8 == 2) {
+                return "UTF-8";
+            }
+        }
+
+#    endif    /* ! mbtowc() */
+
+        /* Rejoin the mbtowc available/not-available cases.
+         *
+         * Here we either know it isn't a UTF-8 locale (if mbtowc() was
+         * available on the platform), or in examining all the locale-dependent
+         * strings khw could think of, all were ASCII.  Return the codeset as
+         * derived from the locale name, which is far from ideal; often
+         * there is no code set in the name; and at other times they even lie.
          *
          * Find any dot in the locale name */
         retval = (const char *) strchr(locale, '.');
@@ -4236,11 +4333,13 @@ S_my_langinfo_i(pTHX_
 #  endif
 
         return save_to_buffer(retval, retbufp, retbuf_sizep);
+
     } /* Giant switch() of nl_langinfo() items */
 
     return retval;
 
 #  endif    /* All the implementations of my_langinfo() */
+
 /*--------------------------------------------------------------------------*/
 
 }   /* my_langinfo() */
@@ -6369,7 +6468,12 @@ STATIC bool
 S_is_locale_utf8(pTHX_ const char * locale)
 {
     /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise.  It uses
-     * my_langinfo() */
+     * my_langinfo(), which employs various methods to get this information
+     * if nl_langinfo() isn't available, using heuristics as a last resort, in
+     * which case, the result will very likely be correct for locales for
+     * languages that have commonly used non-ASCII characters, but for others,
+     * notably English, it comes down to if the locale's name ends in something
+     * like "UTF-8".  It errs on the side of not being a UTF-8 locale. */
 
 #  if ! defined(USE_LOCALE_CTYPE)                                             \
    ||   defined(EBCDIC) /* Imperfect proxy for os390, on which there aren't any