XXXdelta Add Perl_langinfo8()

This is like Perl_langinfo() but additionally returns information about the UTF-8ness of the returned string.
Perl · Apr 29, 2021 · 4bc9011 · 4bc9011
1 parent 1bde665
commit 4bc9011
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 101 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -1601,8 +1601,10 @@ ATdo	|const char*|Perl_setlocale|const int category|NULLOK const char* locale
 ATdo	|HV *	|Perl_localeconv
 #if defined(HAS_NL_LANGINFO) && defined(PERL_LANGINFO_H)
 ATdo	|const char*|Perl_langinfo|const nl_item item
+ATdo	|const char*|Perl_langinfo8|const nl_item item|NULLOK int * utf8ness
 #else
 ATdo	|const char*|Perl_langinfo|const int item
+ATdo	|const char*|Perl_langinfo8|const int item|NULLOK int * utf8ness
 #endif
 pEX	|int	|mbtowc_|NULLOK const wchar_t * pwc|NULLOK const char * s|const Size_t len
 CpO	|int	|init_i18nl10n	|int printwarn

diff --git a/ext/I18N-Langinfo/Langinfo.xs b/ext/I18N-Langinfo/Langinfo.xs
@@ -26,7 +26,7 @@ langinfo(code)
 	int	code
   PREINIT:
         const   char * value;
-        STRLEN  len;
+        int     is_utf8;
   PROTOTYPE: _
   CODE:
 #ifdef HAS_NL_LANGINFO
@@ -36,64 +36,8 @@ langinfo(code)
 	} else
 #endif
         {
-            value = Perl_langinfo(code);
-            len = strlen(value);
-            RETVAL = newSVpvn(Perl_langinfo(code), len);
-
-            /* Now see if the UTF-8 flag should be turned on */
-#ifdef USE_LOCALE_CTYPE     /* No utf8 strings if not using LC_CTYPE */
-
-            /* If 'value' is ASCII or not legal UTF-8, the flag doesn't get
-             * turned on, so skip the followin code */
-            if (is_utf8_non_invariant_string((U8 *) value, len)) {
-                int category;
-
-                /* Check if the locale is a UTF-8 one.  The returns from
-                 * Perl_langinfo() are in different locale categories, so check the
-                 * category corresponding to this item */
-                switch (code) {
-
-                    /* This should always return ASCII, so we could instead
-                     * legitimately panic here, but soldier on */
-                    case CODESET:
-                        category = LC_CTYPE;
-                        break;
-
-                    case RADIXCHAR:
-                    case THOUSEP:
-#  ifdef USE_LOCALE_NUMERIC
-                        category = LC_NUMERIC;
-#  else
-                        /* Not ideal, but the best we can do on such a platform */
-                        category = LC_CTYPE;
-#  endif
-                        break;
-
-                    case CRNCYSTR:
-#  ifdef USE_LOCALE_MONETARY
-                        category = LC_MONETARY;
-#  else
-                        category = LC_CTYPE;
-#  endif
-                        break;
-
-                    default:
-#  ifdef USE_LOCALE_TIME
-                        category = LC_TIME;
-#  else
-                        category = LC_CTYPE;
-#  endif
-                        break;
-                }
-
-                /* Here the return is legal UTF-8.  Turn on that flag if the
-                 * locale is UTF-8.  (Otherwise, could just be a coincidence.)
-                 * */
-                if (_is_cur_LC_category_utf8(category)) {
-                    SvUTF8_on(RETVAL);
-                }
-            }
-#endif /* USE_LOCALE_CTYPE */
+            value = Perl_langinfo8(code, &is_utf8);
+            RETVAL = newSVpvn_utf8(value, strlen(value), is_utf8 > 1);
         }
 
   OUTPUT:

diff --git a/locale.c b/locale.c
@@ -3388,6 +3388,12 @@ S_get_nl_item_from_localeconv(pTHX_ const struct lconv *lcbuf,
 #endif      /*   Has some form of localeconv() and paying attn to a category it
                  traffics in */
 
+#ifndef HAS_SOME_LANGINFO
+
+typedef int nl_item;    /* Substitute 'int' for emulated nl_langinfo() */
+
+#endif
+
 /*
 
 =for apidoc Perl_langinfo
@@ -3398,20 +3404,69 @@ But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
 of Perl's locale handling from your code, and can be used on systems that lack
 a native C<nl_langinfo>.
 
-Expanding on these:
+However, you should instead use the improved version of this:
+L</Perl_langinfo8>, which additionally returns to you if you should treat the
+returned string as being encoded in UTF-8 or not.
+
+=cut
+
+*/
+
+const char *
+Perl_langinfo(const nl_item item)
+{
+    return Perl_langinfo8(item, NULL);
+}
+
+/*
+
+=for apidoc Perl_langinfo8
+
+This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>,
+except it returns a S<C<const char *>> instead of a plain S<C<char *>>, and
+takes an extra (final) parameter that gives you information about the UTF8-ness
+of the returned string.  See L</Perl_langinfo> for a version without that
+parameter.)
+
+Otherwise it takes the same C<item> parameter values, and returns the same
+information.  But it is more thread-safe than regular C<nl_langinfo()>, and
+hides the quirks of Perl's locale handling from your code, and can be portably
+used on systems that lack a native C<nl_langinfo>.
+
+If the final parameter, C<utf8ness> is not NULL, it is assumed to be a pointer
+to a location which, upon return, will be set to indicate the UTF-8ness of the
+'item's returned value, as follows:
 
 =over
 
-=item *
+=item Z<>2
+
+The returned string should definitely be treated as UTF-8.
 
-The reason it isn't quite a drop-in replacement is actually an advantage.  The
-only difference is that it returns S<C<const char *>>, whereas plain
-C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation)
-forbidden to write into the buffer.  By declaring this C<const>, the compiler
-enforces this restriction, so if it is violated, you know at compilation time,
-rather than getting segfaults at runtime.
+=item Z<>0
 
-=item *
+The returned string should definitely NOT be treated as UTF-8.
+
+=item Z<>1
+
+The returne string is entirely ASCII, and so its representation is the same
+whether encoded in UTF-8 or not.  You may treat it whichever is most convenient
+for you.
+
+=back
+
+Concerning the differences between this and plain C<nl_langinfo()>:
+
+=over
+
+=item a.
+
+Besides the extra parameter, the other reason it isn't quite a drop-in
+replacement is actually an advantage. The C<const>ness of the return allows the
+compiler to catch attempts to write into the returned buffer, which is illegal
+and could cause run-time crashes.
+
+=item b.
 
 It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
 without you having to write extra code.  The reason for the extra code would be
@@ -3425,50 +3480,54 @@ the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
 (or equivalent) locale would break a lot of CPAN, which is expecting the radix
 (decimal point) character to be a dot.)
 
-=item *
+=item c.
 
 The system function it replaces can have its static return buffer trashed,
 not only by a subsequent call to that function, but by a C<freelocale>,
 C<setlocale>, or other locale change.  The returned buffer of this function is
 not changed until the next call to it, so the buffer is never in a trashed
 state.
 
-=item *
+=item d.
 
 Its return buffer is per-thread, so it also is never overwritten by a call to
 this function from another thread;  unlike the function it replaces.
 
-=item *
+=item e.
 
 But most importantly, it works on systems that don't have C<nl_langinfo>, such
 as Windows, hence makes your code more portable.  Of the fifty-some possible
 items specified by the POSIX 2008 standard,
 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
 only one is completely unimplemented, though on non-Windows platforms, another
-significant one is also not implemented).  It uses various techniques to
+significant one is not fully implemented).  It uses various techniques to
 recover the other items, including calling C<L<localeconv(3)>>, and
 C<L<strftime(3)>>, both of which are specified in C89, so should be always be
 available.  Later C<strftime()> versions have additional capabilities; C<""> is
-returned for those not available on your system.
+returned for any item not available on your system.
 
-It is important to note that when called with an item that is recovered by
+It is important to note that, when called with an item that is recovered by
 using C<localeconv>, the buffer from any previous explicit call to
-C<localeconv> will be overwritten.  This means you must save that buffer's
-contents if you need to access them after a call to this function.  (But note
-that you might not want to be using C<localeconv()> directly anyway, because of
-issues like the ones listed in the second item of this list (above) for
-C<RADIXCHAR> and C<THOUSEP>.  You can use the methods given in L<perlcall> to
-call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to
-unpack).
+C<L<localeconv(3)>> will be overwritten.  But you shouldn't be using
+C<localeconv> anyway because it is is very much not thread-safe, and suffers
+from the same problems outlined in item 'b.' above for the fields it returns that
+are controlled by the LC_NUMERIC locale category.  Instead, avoid all of those
+problems by calling L</Perl_localeconv>, which is thread-safe; or by using the
+methods given in L<perlcall>  to call
+L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
+
+=back
 
 The details for those items which may deviate from what this emulation returns
 and what a native C<nl_langinfo()> would return are specified in
 L<I18N::Langinfo>.
 
-=back
+For backwards compatibility, there also is a plain L</Perl_langinfo> which
+doesn't have the extra parameter, so takes the exact parameters as the system
+one.
 
-When using C<Perl_langinfo> on systems that don't have a native
-C<nl_langinfo()>, you must
+When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
+have a native C<nl_langinfo()>, you must
 
  #include "perl_langinfo.h"
 
@@ -3477,30 +3536,22 @@ C<#include> with this one.  (Doing it this way keeps out the symbols that plain
 C<langinfo.h> would try to import into the namespace for code that doesn't need
 it.)
 
-The original impetus for C<Perl_langinfo()> was so that code that needs to
-find out the current currency symbol, floating point radix character, or digit
-grouping separator can use, on all systems, the simpler and more
-thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
-pain to make thread-friendly.  For other fields returned by C<localeconv>, it
-is better to use the methods given in L<perlcall> to call
-L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
-
 =cut
 
 */
 
-#ifndef HAS_SOME_LANGINFO
-
-typedef int nl_item;    /* Substitute 'int' for emulated nl_langinfo() */
-
-#endif
-
 const char *
-Perl_langinfo(const nl_item item)
+Perl_langinfo8(const nl_item item, int * utf8ness)
 {
     dTHX;
     unsigned cat_index;
 
+    PERL_ARGS_ASSERT_PERL_LANGINFO8;
+
+    if (utf8ness) {     /* Assume for now */
+        *utf8ness = 1;
+    }
+
     /* Find the locale category that controls the input 'item'.  If we are not
      * paying attention to that category, instead return a default value.  Also
      * return the default value if there is no way for us to figure out the
@@ -3680,11 +3731,11 @@ Perl_langinfo(const nl_item item)
     /* Use either the underlying numeric, or the other underlying categories */
     if (cat_index == LC_NUMERIC_INDEX_) {
         return my_langinfo_c(item, LC_NUMERIC, USE_UNDERLYING_NUMERIC,
-                             &PL_langinfo_buf, &PL_langinfo_bufsize, NULL);
+                             &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
     }
     else {
         return my_langinfo_i(item, cat_index, NULL,
-                             &PL_langinfo_buf, &PL_langinfo_bufsize, NULL);
+                             &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
     }
 
 #endif

diff --git a/proto.h b/proto.h
@@ -4223,6 +4223,8 @@ STATIC int	S_sv_2iuv_non_preserve(pTHX_ SV *const sv);
 #if !(defined(HAS_NL_LANGINFO) && defined(PERL_LANGINFO_H))
 PERL_CALLCONV const char*	Perl_langinfo(const int item);
 #define PERL_ARGS_ASSERT_PERL_LANGINFO
+PERL_CALLCONV const char*	Perl_langinfo8(const int item, int * utf8ness);
+#define PERL_ARGS_ASSERT_PERL_LANGINFO8
 #endif
 #if !(defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L))
 #  if (defined(HAS_LOCALECONV) || defined(HAS_LOCALECONV_L))	       && (defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_NUMERIC))
@@ -4727,6 +4729,8 @@ PERL_CALLCONV I32	Perl_do_shmio(pTHX_ I32 optype, SV** mark, SV** sp);
 #if defined(HAS_NL_LANGINFO) && defined(PERL_LANGINFO_H)
 PERL_CALLCONV const char*	Perl_langinfo(const nl_item item);
 #define PERL_ARGS_ASSERT_PERL_LANGINFO
+PERL_CALLCONV const char*	Perl_langinfo8(const nl_item item, int * utf8ness);
+#define PERL_ARGS_ASSERT_PERL_LANGINFO8
 #endif
 #if defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L)
 #  if defined(PERL_IN_LOCALE_C)