Skip to content

Commit

Permalink
XXXdelta Add Perl_langinfo8()
Browse files Browse the repository at this point in the history
This is like Perl_langinfo() but additionally returns information about
the UTF-8ness of the returned string.
  • Loading branch information
khwilliamson committed Apr 29, 2021
1 parent 1bde665 commit 4bc9011
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 101 deletions.
2 changes: 2 additions & 0 deletions embed.fnc
Expand Up @@ -1601,8 +1601,10 @@ ATdo |const char*|Perl_setlocale|const int category|NULLOK const char* locale
ATdo |HV * |Perl_localeconv
#if defined(HAS_NL_LANGINFO) && defined(PERL_LANGINFO_H)
ATdo |const char*|Perl_langinfo|const nl_item item
ATdo |const char*|Perl_langinfo8|const nl_item item|NULLOK int * utf8ness
#else
ATdo |const char*|Perl_langinfo|const int item
ATdo |const char*|Perl_langinfo8|const int item|NULLOK int * utf8ness
#endif
pEX |int |mbtowc_|NULLOK const wchar_t * pwc|NULLOK const char * s|const Size_t len
CpO |int |init_i18nl10n |int printwarn
Expand Down
62 changes: 3 additions & 59 deletions ext/I18N-Langinfo/Langinfo.xs
Expand Up @@ -26,7 +26,7 @@ langinfo(code)
int code
PREINIT:
const char * value;
STRLEN len;
int is_utf8;
PROTOTYPE: _
CODE:
#ifdef HAS_NL_LANGINFO
Expand All @@ -36,64 +36,8 @@ langinfo(code)
} else
#endif
{
value = Perl_langinfo(code);
len = strlen(value);
RETVAL = newSVpvn(Perl_langinfo(code), len);

/* Now see if the UTF-8 flag should be turned on */
#ifdef USE_LOCALE_CTYPE /* No utf8 strings if not using LC_CTYPE */

/* If 'value' is ASCII or not legal UTF-8, the flag doesn't get
* turned on, so skip the followin code */
if (is_utf8_non_invariant_string((U8 *) value, len)) {
int category;

/* Check if the locale is a UTF-8 one. The returns from
* Perl_langinfo() are in different locale categories, so check the
* category corresponding to this item */
switch (code) {

/* This should always return ASCII, so we could instead
* legitimately panic here, but soldier on */
case CODESET:
category = LC_CTYPE;
break;

case RADIXCHAR:
case THOUSEP:
# ifdef USE_LOCALE_NUMERIC
category = LC_NUMERIC;
# else
/* Not ideal, but the best we can do on such a platform */
category = LC_CTYPE;
# endif
break;

case CRNCYSTR:
# ifdef USE_LOCALE_MONETARY
category = LC_MONETARY;
# else
category = LC_CTYPE;
# endif
break;

default:
# ifdef USE_LOCALE_TIME
category = LC_TIME;
# else
category = LC_CTYPE;
# endif
break;
}

/* Here the return is legal UTF-8. Turn on that flag if the
* locale is UTF-8. (Otherwise, could just be a coincidence.)
* */
if (_is_cur_LC_category_utf8(category)) {
SvUTF8_on(RETVAL);
}
}
#endif /* USE_LOCALE_CTYPE */
value = Perl_langinfo8(code, &is_utf8);
RETVAL = newSVpvn_utf8(value, strlen(value), is_utf8 > 1);
}

OUTPUT:
Expand Down
135 changes: 93 additions & 42 deletions locale.c
Expand Up @@ -3388,6 +3388,12 @@ S_get_nl_item_from_localeconv(pTHX_ const struct lconv *lcbuf,
#endif /* Has some form of localeconv() and paying attn to a category it
traffics in */

#ifndef HAS_SOME_LANGINFO

typedef int nl_item; /* Substitute 'int' for emulated nl_langinfo() */

#endif

/*
=for apidoc Perl_langinfo
Expand All @@ -3398,20 +3404,69 @@ But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
of Perl's locale handling from your code, and can be used on systems that lack
a native C<nl_langinfo>.
Expanding on these:
However, you should instead use the improved version of this:
L</Perl_langinfo8>, which additionally returns to you if you should treat the
returned string as being encoded in UTF-8 or not.
=cut
*/

const char *
Perl_langinfo(const nl_item item)
{
return Perl_langinfo8(item, NULL);
}

/*
=for apidoc Perl_langinfo8
This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>,
except it returns a S<C<const char *>> instead of a plain S<C<char *>>, and
takes an extra (final) parameter that gives you information about the UTF8-ness
of the returned string. See L</Perl_langinfo> for a version without that
parameter.)
Otherwise it takes the same C<item> parameter values, and returns the same
information. But it is more thread-safe than regular C<nl_langinfo()>, and
hides the quirks of Perl's locale handling from your code, and can be portably
used on systems that lack a native C<nl_langinfo>.
If the final parameter, C<utf8ness> is not NULL, it is assumed to be a pointer
to a location which, upon return, will be set to indicate the UTF-8ness of the
'item's returned value, as follows:
=over
=item *
=item Z<>2
The returned string should definitely be treated as UTF-8.
The reason it isn't quite a drop-in replacement is actually an advantage. The
only difference is that it returns S<C<const char *>>, whereas plain
C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation)
forbidden to write into the buffer. By declaring this C<const>, the compiler
enforces this restriction, so if it is violated, you know at compilation time,
rather than getting segfaults at runtime.
=item Z<>0
=item *
The returned string should definitely NOT be treated as UTF-8.
=item Z<>1
The returne string is entirely ASCII, and so its representation is the same
whether encoded in UTF-8 or not. You may treat it whichever is most convenient
for you.
=back
Concerning the differences between this and plain C<nl_langinfo()>:
=over
=item a.
Besides the extra parameter, the other reason it isn't quite a drop-in
replacement is actually an advantage. The C<const>ness of the return allows the
compiler to catch attempts to write into the returned buffer, which is illegal
and could cause run-time crashes.
=item b.
It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
without you having to write extra code. The reason for the extra code would be
Expand All @@ -3425,50 +3480,54 @@ the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
(or equivalent) locale would break a lot of CPAN, which is expecting the radix
(decimal point) character to be a dot.)
=item *
=item c.
The system function it replaces can have its static return buffer trashed,
not only by a subsequent call to that function, but by a C<freelocale>,
C<setlocale>, or other locale change. The returned buffer of this function is
not changed until the next call to it, so the buffer is never in a trashed
state.
=item *
=item d.
Its return buffer is per-thread, so it also is never overwritten by a call to
this function from another thread; unlike the function it replaces.
=item *
=item e.
But most importantly, it works on systems that don't have C<nl_langinfo>, such
as Windows, hence makes your code more portable. Of the fifty-some possible
items specified by the POSIX 2008 standard,
L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
only one is completely unimplemented, though on non-Windows platforms, another
significant one is also not implemented). It uses various techniques to
significant one is not fully implemented). It uses various techniques to
recover the other items, including calling C<L<localeconv(3)>>, and
C<L<strftime(3)>>, both of which are specified in C89, so should be always be
available. Later C<strftime()> versions have additional capabilities; C<""> is
returned for those not available on your system.
returned for any item not available on your system.
It is important to note that when called with an item that is recovered by
It is important to note that, when called with an item that is recovered by
using C<localeconv>, the buffer from any previous explicit call to
C<localeconv> will be overwritten. This means you must save that buffer's
contents if you need to access them after a call to this function. (But note
that you might not want to be using C<localeconv()> directly anyway, because of
issues like the ones listed in the second item of this list (above) for
C<RADIXCHAR> and C<THOUSEP>. You can use the methods given in L<perlcall> to
call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to
unpack).
C<L<localeconv(3)>> will be overwritten. But you shouldn't be using
C<localeconv> anyway because it is is very much not thread-safe, and suffers
from the same problems outlined in item 'b.' above for the fields it returns that
are controlled by the LC_NUMERIC locale category. Instead, avoid all of those
problems by calling L</Perl_localeconv>, which is thread-safe; or by using the
methods given in L<perlcall> to call
L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
=back
The details for those items which may deviate from what this emulation returns
and what a native C<nl_langinfo()> would return are specified in
L<I18N::Langinfo>.
=back
For backwards compatibility, there also is a plain L</Perl_langinfo> which
doesn't have the extra parameter, so takes the exact parameters as the system
one.
When using C<Perl_langinfo> on systems that don't have a native
C<nl_langinfo()>, you must
When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
have a native C<nl_langinfo()>, you must
#include "perl_langinfo.h"
Expand All @@ -3477,30 +3536,22 @@ C<#include> with this one. (Doing it this way keeps out the symbols that plain
C<langinfo.h> would try to import into the namespace for code that doesn't need
it.)
The original impetus for C<Perl_langinfo()> was so that code that needs to
find out the current currency symbol, floating point radix character, or digit
grouping separator can use, on all systems, the simpler and more
thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
pain to make thread-friendly. For other fields returned by C<localeconv>, it
is better to use the methods given in L<perlcall> to call
L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
=cut
*/

#ifndef HAS_SOME_LANGINFO

typedef int nl_item; /* Substitute 'int' for emulated nl_langinfo() */

#endif

const char *
Perl_langinfo(const nl_item item)
Perl_langinfo8(const nl_item item, int * utf8ness)
{
dTHX;
unsigned cat_index;

PERL_ARGS_ASSERT_PERL_LANGINFO8;

if (utf8ness) { /* Assume for now */
*utf8ness = 1;
}

/* Find the locale category that controls the input 'item'. If we are not
* paying attention to that category, instead return a default value. Also
* return the default value if there is no way for us to figure out the
Expand Down Expand Up @@ -3680,11 +3731,11 @@ Perl_langinfo(const nl_item item)
/* Use either the underlying numeric, or the other underlying categories */
if (cat_index == LC_NUMERIC_INDEX_) {
return my_langinfo_c(item, LC_NUMERIC, USE_UNDERLYING_NUMERIC,
&PL_langinfo_buf, &PL_langinfo_bufsize, NULL);
&PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
}
else {
return my_langinfo_i(item, cat_index, NULL,
&PL_langinfo_buf, &PL_langinfo_bufsize, NULL);
&PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
}

#endif
Expand Down
4 changes: 4 additions & 0 deletions proto.h
Expand Up @@ -4223,6 +4223,8 @@ STATIC int S_sv_2iuv_non_preserve(pTHX_ SV *const sv);
#if !(defined(HAS_NL_LANGINFO) && defined(PERL_LANGINFO_H))
PERL_CALLCONV const char* Perl_langinfo(const int item);
#define PERL_ARGS_ASSERT_PERL_LANGINFO
PERL_CALLCONV const char* Perl_langinfo8(const int item, int * utf8ness);
#define PERL_ARGS_ASSERT_PERL_LANGINFO8
#endif
#if !(defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L))
# if (defined(HAS_LOCALECONV) || defined(HAS_LOCALECONV_L)) && (defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_NUMERIC))
Expand Down Expand Up @@ -4727,6 +4729,8 @@ PERL_CALLCONV I32 Perl_do_shmio(pTHX_ I32 optype, SV** mark, SV** sp);
#if defined(HAS_NL_LANGINFO) && defined(PERL_LANGINFO_H)
PERL_CALLCONV const char* Perl_langinfo(const nl_item item);
#define PERL_ARGS_ASSERT_PERL_LANGINFO
PERL_CALLCONV const char* Perl_langinfo8(const nl_item item, int * utf8ness);
#define PERL_ARGS_ASSERT_PERL_LANGINFO8
#endif
#if defined(HAS_NL_LANGINFO) || defined(HAS_NL_LANGINFO_L)
# if defined(PERL_IN_LOCALE_C)
Expand Down

0 comments on commit 4bc9011

Please sign in to comment.