diff --git a/dist/ExtUtils-ParseXS/lib/perlxs.pod b/dist/ExtUtils-ParseXS/lib/perlxs.pod index 4a339ddfd998..5aa215b3d441 100644 --- a/dist/ExtUtils-ParseXS/lib/perlxs.pod +++ b/dist/ExtUtils-ParseXS/lib/perlxs.pod @@ -603,7 +603,7 @@ and C<$type> can be used as in typemaps. bool_t rpcb_gettime(host,timep) - char *host = (char *)SvPV_nolen($arg); + char *host = (char *)SvPVbyte_nolen($arg); time_t &timep = 0; OUTPUT: timep @@ -630,7 +630,7 @@ Here's a truly obscure example: bool_t rpcb_gettime(host,timep) time_t &timep; /* \$v{timep}=@{[$v{timep}=$arg]} */ - char *host + SvOK($v{timep}) ? SvPV_nolen($arg) : NULL; + char *host + SvOK($v{timep}) ? SvPVbyte_nolen($arg) : NULL; OUTPUT: timep @@ -993,7 +993,7 @@ The XS code, with ellipsis, follows. char *host = "localhost"; CODE: if( items > 1 ) - host = (char *)SvPV_nolen(ST(1)); + host = (char *)SvPVbyte_nolen(ST(1)); RETVAL = rpcb_gettime( host, &timep ); OUTPUT: timep @@ -1294,7 +1294,7 @@ prototypes. char *host = "localhost"; CODE: if( items > 1 ) - host = (char *)SvPV_nolen(ST(1)); + host = (char *)SvPVbyte_nolen(ST(1)); RETVAL = rpcb_gettime( host, &timep ); OUTPUT: timep diff --git a/dist/ExtUtils-ParseXS/lib/perlxstut.pod b/dist/ExtUtils-ParseXS/lib/perlxstut.pod index 8e1372167073..fcafa58a81f8 100644 --- a/dist/ExtUtils-ParseXS/lib/perlxstut.pod +++ b/dist/ExtUtils-ParseXS/lib/perlxstut.pod @@ -1143,7 +1143,8 @@ Mytest.xs: for (n = 0; n <= numpaths; n++) { HV * rh; STRLEN l; - char * fn = SvPV(*av_fetch((AV *)SvRV(paths), n, 0), l); + SV * path = *av_fetch((AV *)SvRV(paths), n, 0); + char * fn = SvPVbyte(path, l); i = statfs(fn, &buf); if (i != 0) { diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 1f53eee14219..735e28a9ffe4 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -143,8 +143,13 @@ XXX =head1 Documentation -XXX Changes to files in F<pod/> go here. Consider grouping entries by -file and be sure to link to the appropriate page, e.g. L<perlfunc>. 
+L<perlguts> now explains in greater detail the need to consult SvUTF8 +when calling SvPV (or variants). A new "How do I pass a Perl string to a C +library?" section in the same document discusses when to use which style of +macro to read an SV's string value. + +L<perlapi>, L<perlguts>, L<perlxs>, and L<perlxstut> now prefer SvPVbyte +over SvPV. =head2 New Documentation diff --git a/pod/perlguts.pod b/pod/perlguts.pod index 8d0b7894f07a..f1fd7da34af7 100644 --- a/pod/perlguts.pod +++ b/pod/perlguts.pod @@ -153,27 +153,74 @@ Perl's own functions typically add a trailing C<NUL> for this reason. Nevertheless, you should be very careful when you pass a string stored in an SV to a C function or system call. -To access the actual value that an SV points to, you can use the macros: - - SvIV(SV*) - SvUV(SV*) - SvNV(SV*) - SvPV(SV*, STRLEN len) - SvPV_nolen(SV*) - -which will automatically coerce the actual scalar type into an IV, UV, double, -or string. - -In the C<SvPV> macro, the length of the string returned is placed into the -variable C<len> (this is a macro, so you do I<not> use C<&len>). If you do -not care what the length of the data is, use the C<SvPV_nolen> macro. -Historically the C<SvPV> macro with the global variable C<PL_na> has been -used in this case. But that can be quite inefficient because C<PL_na> must +To access the actual value that an SV points to, Perl's API exposes +several macros that coerce the actual scalar type into an IV, UV, double, +or string: + +=over + +=item * C<SvIV(SV*)> (C<IV>) and C<SvUV(SV*)> (C<UV>) + +=item * C<SvNV(SV*)> (C<double>) + +=item * Strings are a bit complicated: + +=over + +=item * Byte string: C<SvPVbyte(SV*, STRLEN len)> or C<SvPVbyte_nolen(SV*)> + +If the Perl string is C<"\xff\xff">, then this returns a 2-byte C<char*>. + +This is suitable for Perl strings that represent bytes. + +=item * UTF-8 string: C<SvPVutf8(SV*, STRLEN len)> or C<SvPVutf8_nolen(SV*)> + +If the Perl string is C<"\xff\xff">, then this returns a 4-byte C<char*>. + +This is suitable for Perl strings that represent characters. 
+ +B<CAVEAT>: That C<char*> will be encoded via Perl's internal UTF-8 variant, +which means that if the SV contains non-Unicode code points (e.g., +0x110000), then the result may contain extensions over valid UTF-8. +See L<perlapi/is_strict_utf8_string> for some methods Perl gives +you to check the UTF-8 validity of these macros' returns. + +=item * You can also use C<SvPV(SV*, STRLEN len)> or C<SvPV_nolen(SV*)> +to fetch the SV's raw internal buffer. This is tricky, though; if your Perl +string +is C<"\xff\xff">, then depending on the SV's internal encoding you might get +back a 2-byte B<OR> a 4-byte C<char*>. +Moreover, if it's the 4-byte string, that could come from either Perl +C<"\xff\xff"> stored UTF-8 encoded, or Perl C<"\xc3\xbf\xc3\xbf"> stored +as raw octets. To differentiate between these you B<MUST> look up the +SV's UTF8 bit (cf. C<SvUTF8>) to know whether the source Perl string +is 2 characters (C<SvUTF8> would be on) or 4 characters (C<SvUTF8> would be +off). + +B<IMPORTANT:> Use of C<SvPV>, C<SvPV_nolen>, or +similarly-named macros I<without> looking up the SV's UTF8 bit is +almost certainly a bug if non-ASCII input is allowed. + +When the UTF8 bit is on, the same B<CAVEAT> about UTF-8 validity applies +here as for C<SvPVutf8>. + +=back + +(See L</How do I pass a Perl string to a C library?> for more details.) + +In C<SvPVbyte>, C<SvPVutf8>, and C<SvPV>, the length of the C<char*> returned +is placed into the +variable C<len> (these are macros, so you do I<not> use C<&len>). If you do +not care what the length of the data is, use C<SvPVbyte_nolen>, +C<SvPVutf8_nolen>, or C<SvPV_nolen> instead. +The global variable C<PL_na> can also be given to +C<SvPVbyte>/C<SvPVutf8>/C<SvPV> +in this case. But that can be quite inefficient because C<PL_na> must be accessed in thread-local storage in threaded Perl. In any case, remember that Perl allows arbitrary strings of data that may both contain NULs and might not be terminated by a C<NUL>. -Also remember that C doesn't allow you to safely say C<foo(SvPV(s, len), +Also remember that C doesn't allow you to safely say C<foo(SvPVbyte(s, len), len);>. It might work with your compiler, but it won't work for everyone. 
Break this sort of statement up into separate assignments: @@ -181,9 +228,11 @@ Break this sort of statement up into separate assignments: SV *s; STRLEN len; char *ptr; - ptr = SvPV(s, len); + ptr = SvPVbyte(s, len); foo(ptr, len); +=back + If you want to know if the scalar value is TRUE, you can use: SvTRUE(SV*) @@ -200,7 +249,7 @@ add space for the trailing C<NUL> byte (perl's own string functions typically do C<SvGROW(sv, len + 1)>). If you want to write to an existing SV's buffer and set its value to a -string, use SvPV_force() or one of its variants to force the SV to be +string, use SvPVbyte_force() or one of its variants to force the SV to be a PV. This will remove any of various types of non-stringness from the SV while preserving the content of the SV in the PV. This can be used, for example, to append data from an API function to a buffer @@ -3243,6 +3292,66 @@ There is no published API for dealing with this, as it is subject to change, but you can look at the code for C<pp_lc> in F<pp.c> for an example as to how it's currently done. +=head2 How do I pass a Perl string to a C library? + +A Perl string, conceptually, is an opaque sequence of code points. +Many C libraries expect their inputs to be "classical" C strings, which are +arrays of octets 1-255, terminated with a NUL byte. Your job when writing +an interface between Perl and a C library is to define the mapping between +Perl and that library. + +Generally speaking, C<SvPVbyte> and related macros suit this task well. +These assume that your Perl string is a "byte string", i.e., is either +raw, undecoded input into Perl or is pre-encoded to, e.g., UTF-8. + +Alternatively, if your C library expects UTF-8 text, you can use +C<SvPVutf8> and related macros. This has the same effect as encoding +to UTF-8 then calling the corresponding C<SvPVbyte>-related macro. + +Some C libraries may expect other encodings (e.g., UTF-16LE). 
To give +Perl strings to such libraries +you must either do that encoding in Perl then use C<SvPVbyte>, or +use an intermediary C library to convert from however Perl stores the +string to the desired encoding. + +Take care also that NULs in your Perl string don't confuse the C +library. If possible, give the string's length to the C library; if that's +not possible, consider rejecting strings that contain NUL bytes. + +=head3 What about C<SvPV>, C<SvPV_nolen>, etc.? + +Consider a 3-character Perl string C<$foo = "\x64\x78\x8c">. +Perl can store these 3 characters in either of two ways: + +=over + +=item * bytes: 0x64 0x78 0x8c + +=item * UTF-8: 0x64 0x78 0xc2 0x8c + +=back + +Now let's say you convert C<$foo> to a C string thus: + + STRLEN strlen; + char *str = SvPV(foo_sv, strlen); + +At this point C<str> could point to a 3-byte C string or a 4-byte one. + +Generally speaking, we want C<str> to be the same regardless of how +Perl stores C<$foo>, so the ambiguity here is undesirable. C<SvPVbyte> +and C<SvPVutf8> solve that by giving predictable output: use +C<SvPVbyte> if your C library expects byte strings, or C<SvPVutf8> +if it expects UTF-8. + +If your C library happens to support both encodings, then C<SvPV>--always +in tandem with lookups to C<SvUTF8>!--may be safe and (slightly) more +efficient. + +B<TESTING> B<TIP:> Use L<utf8>'s C<upgrade> and C<downgrade> functions +in your tests to ensure consistent handling regardless of Perl's +internal encoding. + =head2 How do I convert a string to UTF-8? If you're mixing UTF-8 and non-UTF-8 strings, it is necessary to upgrade diff --git a/sv.h b/sv.h index ba701ed1a6f9..240986ca33cf 100644 --- a/sv.h +++ b/sv.h @@ -801,7 +801,9 @@ compiler will complain if you were to try to modify the contents of the string, (unless you cast away const yourself). =for apidoc Am|STRLEN|SvCUR|SV* sv -Returns the length of the string which is in the SV. See C<L</SvLEN>>. +Returns the length, in bytes, of the PV inside the SV. +Note that this may not match Perl's C<length>; for that, use +C<sv_len_utf8(sv)>. See C<L</SvLEN>> also. 
=for apidoc Am|STRLEN|SvLEN|SV* sv Returns the size of the string buffer in the SV, not including any part @@ -855,8 +857,8 @@ Set the value of the MAGIC pointer in C<sv> to val. See C<L</SvIV_set>>. Set the value of the STASH pointer in C<sv> to val. See C<L</SvIV_set>>. =for apidoc Am|void|SvCUR_set|SV* sv|STRLEN len -Set the current length of the string which is in the SV. See C<L</SvCUR>> -and C<L</SvIV_set>>. +Sets the current length, in bytes, of the C string which is in the SV. +See C<L</SvCUR>> and C<L</SvIV_set>>. =for apidoc Am|void|SvLEN_set|SV* sv|STRLEN len Set the size of the string buffer for the SV. See C<L</SvLEN>>. @@ -1657,6 +1659,14 @@ see C<L</SvPV_force>>. The differences between the forms are: +The forms with neither C<byte> nor C<utf8> in their names (e.g., C<SvPV> or +C<SvPV_nolen>) can expose the SV's internal string buffer. If +that buffer consists entirely of bytes 0-255 and includes any bytes above +127, then you B<MUST> consult C<SvUTF8> to determine the actual code points +the string is meant to contain. Generally speaking, it is probably safer to +prefer C<SvPVbyte>, C<SvPVutf8>, and the like. See +L<perlguts/How do I pass a Perl string to a C library?> for more details. + The forms with C<flags> in their names allow you to use the C<flags> parameter to specify to process 'get' magic (by setting the C<SV_GMAGIC> flag) or to skip 'get' magic (by clearing it). The other forms process 'get' magic, except for