diff --git a/dist/ExtUtils-ParseXS/lib/perlxs.pod b/dist/ExtUtils-ParseXS/lib/perlxs.pod
index 4a339ddfd998..5aa215b3d441 100644
--- a/dist/ExtUtils-ParseXS/lib/perlxs.pod
+++ b/dist/ExtUtils-ParseXS/lib/perlxs.pod
@@ -603,7 +603,7 @@ and C<$type> can be used as in typemaps.
bool_t
rpcb_gettime(host,timep)
- char *host = (char *)SvPV_nolen($arg);
+ char *host = (char *)SvPVbyte_nolen($arg);
time_t &timep = 0;
OUTPUT:
timep
@@ -630,7 +630,7 @@ Here's a truly obscure example:
bool_t
rpcb_gettime(host,timep)
time_t &timep; /* \$v{timep}=@{[$v{timep}=$arg]} */
- char *host + SvOK($v{timep}) ? SvPV_nolen($arg) : NULL;
+ char *host + SvOK($v{timep}) ? SvPVbyte_nolen($arg) : NULL;
OUTPUT:
timep
@@ -993,7 +993,7 @@ The XS code, with ellipsis, follows.
char *host = "localhost";
CODE:
if( items > 1 )
- host = (char *)SvPV_nolen(ST(1));
+ host = (char *)SvPVbyte_nolen(ST(1));
RETVAL = rpcb_gettime( host, &timep );
OUTPUT:
timep
@@ -1294,7 +1294,7 @@ prototypes.
char *host = "localhost";
CODE:
if( items > 1 )
- host = (char *)SvPV_nolen(ST(1));
+ host = (char *)SvPVbyte_nolen(ST(1));
RETVAL = rpcb_gettime( host, &timep );
OUTPUT:
timep
diff --git a/dist/ExtUtils-ParseXS/lib/perlxstut.pod b/dist/ExtUtils-ParseXS/lib/perlxstut.pod
index 8e1372167073..fcafa58a81f8 100644
--- a/dist/ExtUtils-ParseXS/lib/perlxstut.pod
+++ b/dist/ExtUtils-ParseXS/lib/perlxstut.pod
@@ -1143,7 +1143,8 @@ Mytest.xs:
for (n = 0; n <= numpaths; n++) {
HV * rh;
STRLEN l;
- char * fn = SvPV(*av_fetch((AV *)SvRV(paths), n, 0), l);
+ SV * path = *av_fetch((AV *)SvRV(paths), n, 0);
+ char * fn = SvPVbyte(path, l);
i = statfs(fn, &buf);
if (i != 0) {
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 1f53eee14219..735e28a9ffe4 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -143,8 +143,13 @@ XXX
=head1 Documentation
-XXX Changes to files in F go here. Consider grouping entries by
-file and be sure to link to the appropriate page, e.g. L.
+L<perlguts> now explains in greater detail the need to consult SvUTF8
+when calling SvPV (or variants). A new "How do I pass a Perl string to a C
+library?" section in the same document discusses when to use which style of
+macro to read an SV's string value.
+
+L<perlapi>, L<perlguts>, L<perlxs>, and L<perlxstut> now prefer SvPVbyte
+over SvPV.
=head2 New Documentation
diff --git a/pod/perlguts.pod b/pod/perlguts.pod
index 8d0b7894f07a..f1fd7da34af7 100644
--- a/pod/perlguts.pod
+++ b/pod/perlguts.pod
@@ -153,27 +153,74 @@ Perl's own functions typically add a trailing C for this reason.
Nevertheless, you should be very careful when you pass a string stored
in an SV to a C function or system call.
-To access the actual value that an SV points to, you can use the macros:
-
- SvIV(SV*)
- SvUV(SV*)
- SvNV(SV*)
- SvPV(SV*, STRLEN len)
- SvPV_nolen(SV*)
-
-which will automatically coerce the actual scalar type into an IV, UV, double,
-or string.
-
-In the C<SvPV> macro, the length of the string returned is placed into the
-variable C<len> (this is a macro, so you do I<not> use C<&len>). If you do
-not care what the length of the data is, use the C<SvPV_nolen> macro.
-Historically the C<SvPV> macro with the global variable C<PL_na> has been
-used in this case. But that can be quite inefficient because C<PL_na> must
+To access the actual value that an SV points to, Perl's API exposes
+several macros that coerce the actual scalar type into an IV, UV, double,
+or string:
+
+=over
+
+=item * C<SvIV(SV*)> (C<IV>) and C<SvUV(SV*)> (C<UV>)
+
+=item * C<SvNV(SV*)> (C<double>)
+
+=item * Strings are a bit complicated:
+
+=over
+
+=item * Byte string: C<SvPVbyte(SV*, STRLEN len)> or C<SvPVbyte_nolen(SV*)>
+
+If the Perl string is C<"\xff\xff">, then this returns a 2-byte C<char*>.
+
+This is suitable for Perl strings that represent bytes.
+
+=item * UTF-8 string: C<SvPVutf8(SV*, STRLEN len)> or C<SvPVutf8_nolen(SV*)>
+
+If the Perl string is C<"\xff\xff">, then this returns a 4-byte C<char*>.
+
+This is suitable for Perl strings that represent characters.
+
+B<CAVEAT>: That C<char*> will be encoded via Perl's internal UTF-8 variant,
+which means that if the SV contains non-Unicode code points (e.g.,
+0x110000), then the result may contain extensions over valid UTF-8.
+See L<perlapi/is_strict_utf8_string> for some methods Perl gives
+you to check the UTF-8 validity of these macros' returns.
+
+=item * You can also use C<SvPV(SV*, STRLEN len)> or C<SvPV_nolen(SV*)>
+to fetch the SV's raw internal buffer. This is tricky, though; if your Perl
+string
+is C<"\xff\xff">, then depending on the SV's internal encoding you might get
+back a 2-byte B<or> a 4-byte C<char*>.
+Moreover, if it's the 4-byte string, that could come from either Perl
+C<"\xff\xff"> stored UTF-8 encoded, or Perl C<"\xc3\xbf\xc3\xbf"> stored
+as raw octets. To differentiate between these you B<MUST> look up the
+SV's UTF8 bit (cf. C<SvUTF8>) to know whether the source Perl string
+is 2 characters (C<SvUTF8> would be on) or 4 characters (C<SvUTF8> would be
+off).
+
+B<IMPORTANT:> Use of C<SvPV>, C<SvPV_nolen>, or
+similarly-named macros I<without> looking up the SV's UTF8 bit is
+almost certainly a bug if non-ASCII input is allowed.
+
+When the UTF8 bit is on, the same B<CAVEAT> about UTF-8 validity applies
+here as for C<SvPVutf8>.
+
+=back
+
+(See L</How do I pass a Perl string to a C library?> for more details.)
+
+In C<SvPVbyte>, C<SvPVutf8>, and C<SvPV>, the length of the C<char*> returned
+is placed into the
+variable C<len> (these are macros, so you do I<not> use C<&len>). If you do
+not care what the length of the data is, use C<SvPVbyte_nolen>,
+C<SvPVutf8_nolen>, or C<SvPV_nolen> instead.
+The global variable C<PL_na> can also be given to
+C<SvPVbyte>/C<SvPVutf8>/C<SvPV>
+in this case. But that can be quite inefficient because C<PL_na> must
be accessed in thread-local storage in threaded Perl. In any case, remember
that Perl allows arbitrary strings of data that may both contain NULs and
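might not be terminated by a C<NUL>.
-Also remember that C doesn't allow you to safely say C<foo(SvPV(s, len),
+Also remember that C doesn't allow you to safely say C<foo(SvPVbyte(s, len),
len);>. It might work with your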
compiler, but it won't work for everyone.
Break this sort of statement up into separate assignments:
@@ -181,9 +228,11 @@ Break this sort of statement up into separate assignments:
SV *s;
STRLEN len;
char *ptr;
- ptr = SvPV(s, len);
+ ptr = SvPVbyte(s, len);
foo(ptr, len);
+=back
+
If you want to know if the scalar value is TRUE, you can use:
SvTRUE(SV*)
@@ -200,7 +249,7 @@ add space for the trailing C byte (perl's own string functions typically do
C<SvGROW(sv, len + 1)>).
If you want to write to an existing SV's buffer and set its value to a
-string, use SvPV_force() or one of its variants to force the SV to be
+string, use SvPVbyte_force() or one of its variants to force the SV to be
a PV. This will remove any of various types of non-stringness from
the SV while preserving the content of the SV in the PV. This can be
used, for example, to append data from an API function to a buffer
@@ -3243,6 +3292,66 @@ There is no published API for dealing with this, as it is subject to
change, but you can look at the code for C<pp_lc> in F<pp.c> for an
example as to how it's currently done.
+=head2 How do I pass a Perl string to a C library?
+
+A Perl string, conceptually, is an opaque sequence of code points.
+Many C libraries expect their inputs to be "classical" C strings, which are
+arrays of octets 1-255, terminated with a NUL byte. Your job when writing
+an interface between Perl and a C library is to define the mapping between
+Perl and that library.
+
+Generally speaking, C<SvPVbyte> and related macros suit this task well.
+These assume that your Perl string is a "byte string", i.e., is either
+raw, undecoded input into Perl or is pre-encoded to, e.g., UTF-8.
+
+Alternatively, if your C library expects UTF-8 text, you can use
+C<SvPVutf8> and related macros. This has the same effect as encoding
+to UTF-8 then calling the corresponding C<SvPVbyte>-related macro.
+
+Some C libraries may expect other encodings (e.g., UTF-16LE). To give
+Perl strings to such libraries
+you must either do that encoding in Perl then use C<SvPVbyte>, or
+use an intermediary C library to convert from however Perl stores the
+string to the desired encoding.
+
+Take care also that NULs in your Perl string don't confuse the C
+library. If possible, give the string's length to the C library; if that's
+not possible, consider rejecting strings that contain NUL bytes.
+
+=head3 What about C<SvPV>, C<SvPV_nolen>, etc.?
+
+Consider a 3-character Perl string C<$foo = "\x64\x78\x8c">.
+Perl can store these 3 characters either of two ways:
+
+=over
+
+=item * bytes: 0x64 0x78 0x8c
+
+=item * UTF-8: 0x64 0x78 0xc2 0x8c
+
+=back
+
+Now let's say you convert C<$foo> to a C string thus:
+
+ STRLEN strlen;
+ char *str = SvPV(foo_sv, strlen);
+
+At this point C<str> could point to a 3-byte C string or a 4-byte one.
+
+Generally speaking, we want C<str> to be the same regardless of how
+Perl stores C<$foo>, so the ambiguity here is undesirable. C<SvPVbyte>
+and C<SvPVutf8> solve that by giving predictable output: use
+C<SvPVbyte> if your C library expects byte strings, or C<SvPVutf8>
+if it expects UTF-8.
+
+If your C library happens to support both encodings, then C<SvPV>--always
+in tandem with lookups to C<SvUTF8>!--may be safe and (slightly) more
+efficient.
+
+B<TESTING> B<TIP:> Use L<utf8>'s C<upgrade> and C<downgrade> functions
+in your tests to ensure consistent handling regardless of Perl's
+internal encoding.
+
=head2 How do I convert a string to UTF-8?
If you're mixing UTF-8 and non-UTF-8 strings, it is necessary to upgrade
diff --git a/sv.h b/sv.h
index ba701ed1a6f9..240986ca33cf 100644
--- a/sv.h
+++ b/sv.h
@@ -801,7 +801,9 @@ compiler will complain if you were to try to modify the contents of the string,
(unless you cast away const yourself).
=for apidoc Am|STRLEN|SvCUR|SV* sv
-Returns the length of the string which is in the SV. See C<L</SvLEN>>.
+Returns the length, in bytes, of the PV inside the SV.
+Note that this may not match Perl's C<length>; for that, use
+C<sv_len_utf8(sv)>. See C<L</SvLEN>> also.
=for apidoc Am|STRLEN|SvLEN|SV* sv
Returns the size of the string buffer in the SV, not including any part
@@ -855,8 +857,8 @@ Set the value of the MAGIC pointer in C to val. See C>.
Set the value of the STASH pointer in C<sv> to val. See C<L</SvIV_set>>.
=for apidoc Am|void|SvCUR_set|SV* sv|STRLEN len
-Set the current length of the string which is in the SV. See C<L</SvCUR>>
-and C<L</SvIV_set>>.
+Sets the current length, in bytes, of the C string which is in the SV.
+See C<L</SvCUR>> and C<L</SvIV_set>>.
=for apidoc Am|void|SvLEN_set|SV* sv|STRLEN len
Set the size of the string buffer for the SV. See C<L</SvLEN>>.
@@ -1657,6 +1659,14 @@ see C>.
The differences between the forms are:
+The forms with neither C<byte> nor C<utf8> in their names (e.g., C<SvPV> or
+C<SvPV_nolen>) can expose the SV's internal string buffer. If
+that buffer consists entirely of bytes 0-255 and includes any bytes above
+127, then you B<MUST> consult C<SvUTF8> to determine the actual code points
+the string is meant to contain. Generally speaking, it is probably safer to
+prefer C<SvPVbyte>, C<SvPVutf8>, and the like. See
+L<perlguts/How do I pass a Perl string to a C library?> for more details.
+
The forms with C<flags> in their names allow you to use the C<flags> parameter
to specify to process 'get' magic (by setting the C<SV_GMAGIC> flag) or to skip
'get' magic (by clearing it). The other forms process 'get' magic, except for