XXXdelta Add locale unsupported test

Perl only suppots multi-byte locales that are UTF-8. It turns out that the others are worse than I thought, and if someone switches to one, the program can crash. This commit generates a default-on diagnostic when switching into such a locale, and doesn't take the steps to add its data to the LC_CTYPE database. The check has been done in various releases for some time, but this elevates its severity and tries to avoid actually using its data. The previous commit caused the test suite to view such a locale as unacceptable.
Perl · Apr 20, 2021 · 3847f52 · 3847f52
1 parent ac84d2a
commit 3847f52
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 46 deletions.
diff --git a/locale.c b/locale.c
@@ -2071,6 +2071,36 @@ S_new_ctype(pTHX_ const char *newctype)
         }
     }
 
+#    ifdef MB_CUR_MAX
+
+    /* We only handle single-byte locales (outside of UTF-8 ones; so if this
+     * locale requires more than one byte, there are going to be BIG problems.
+     * */
+
+    if (MB_CUR_MAX > 1 && ! PL_in_utf8_CTYPE_locale
+
+            /* Some platforms return MB_CUR_MAX > 1 for even the "C"
+            * locale.  Just assume that the implementation for them (plus
+            * for POSIX) is correct and the > 1 value is spurious.  (Since
+            * these are specially handled to never be considered UTF-8
+            * locales, as long as this is the only problem, everything
+            * should work fine */
+        && strNE(newctype, "C") && strNE(newctype, "POSIX"))
+    {
+        DEBUG_L(PerlIO_printf(Perl_debug_log,
+                              "Unsupported, MB_CUR_MAX=%zu\n", MB_CUR_MAX));
+
+        Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE),
+                         "Locale '%s' is unsupported, and may crash the"
+                         " interpreter.\n",
+                         newctype);
+    }
+
+#    endif
+
+    DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
+                                           check_for_problems));
+
     /* We don't populate the other lists if a UTF-8 locale, but do check that
      * everything works as expected, unless checking turned off */
     if (check_for_problems) {
@@ -2079,8 +2109,6 @@ S_new_ctype(pTHX_ const char *newctype)
          * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
          * NUL */
         char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
-        bool multi_byte_locale = FALSE;     /* Assume is a single-byte locale
-                                               to start */
         unsigned int bad_count = 0;         /* Count of bad characters */
 
         for (i = 0; i < 256; i++) {
@@ -2212,37 +2240,9 @@ S_new_ctype(pTHX_ const char *newctype)
             PL_in_utf8_turkic_locale = TRUE;
             DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
         }
-        else {
-            PL_in_utf8_turkic_locale = FALSE;
-        }
-
-#  ifdef MB_CUR_MAX
-
-        /* We only handle single-byte locales (outside of UTF-8 ones; so if
-         * this locale requires more than one byte, there are going to be
-         * problems. */
-        DEBUG_Lv(PerlIO_printf(Perl_debug_log,
-                 "check_for_problems=%d, MB_CUR_MAX=%d\n",
-                 check_for_problems, (int) MB_CUR_MAX));
-
-        if (   check_for_problems && MB_CUR_MAX > 1
-            && ! PL_in_utf8_CTYPE_locale
-
-               /* Some platforms return MB_CUR_MAX > 1 for even the "C"
-                * locale.  Just assume that the implementation for them (plus
-                * for POSIX) is correct and the > 1 value is spurious.  (Since
-                * these are specially handled to never be considered UTF-8
-                * locales, as long as this is the only problem, everything
-                * should work fine */
-            && strNE(newctype, "C") && strNE(newctype, "POSIX"))
-        {
-            multi_byte_locale = TRUE;
-        }
-
-#  endif
 
         /* If we found problems and we want them output, do so */
-        if (   (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale))
+        if (   (UNLIKELY(bad_count))
             && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
         {
             if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
@@ -2253,21 +2253,12 @@ S_new_ctype(pTHX_ const char *newctype)
                       newctype, bad_chars_list);
             }
             else {
-                PL_warn_locale = Perl_newSVpvf(aTHX_
-                             "Locale '%s' may not work well.%s%s%s\n",
-                             newctype,
-                             (multi_byte_locale)
-                              ? "  Some characters in it are not recognized by"
-                                " Perl."
-                              : "",
-                             (bad_count)
-                              ? "\nThe following characters (and maybe others)"
-                                " may not have the same meaning as the Perl"
-                                " program expects:\n"
-                              : "",
-                             (bad_count)
-                              ? bad_chars_list
-                              : ""
+                PL_warn_locale =
+                    Perl_newSVpvf(aTHX_
+                                  "\nThe following characters (and maybe"
+                                  " others) may not have the same meaning as"
+                                  " the Perl program expects: %s\n",
+                                  bad_chars_list
                             );
             }
 

diff --git a/pod/perldiag.pod b/pod/perldiag.pod
@@ -3447,6 +3447,17 @@ the first approach, not using C<ispunct()> at all (see L<Note [5] in
 perlrecharclass|perlrecharclass/[5]>), and this message is raised to notify you that you
 are getting Perl's approach, not the locale's.
 
+=item Locale '%s' is unsupported, and may crash the interpreter.
+
+(S locale) The named locale is not supported by Perl, and using it leads
+to undefined behavior, including potentially crashing the computer.
+
+Currently the only locales that generate this severe warning are those
+which have characters that require more than one byte to represent
+(common in older East Asian language locales).  The only locales with
+this characteristic that Perl can handle are UTF-8 locales.  See
+L<perllocale>.
+
 =item Locale '%s' may not work well.%s
 
 (W locale) You are using the named locale, which is a non-UTF-8 one, and