Permalink
Browse files

Search for UTF-8 invariants by word

The functions is_utf8_invariant_string() and
is_utf8_invariant_string_loc() are used in several places in the core
and are part of the public API.  This commit speeds them up
significantly on ASCII (not EBCDIC) platforms, by changing to use
word-at-a-time parsing instead of per-byte.  (Per-byte is retained for
any initial bytes to reach the next word boundary, and any final bytes
that don't fill an entire word.)

The following results were obtained parsing a long string on a 64-bit
word machine:

        byte   word
       ------ ------
    Ir 100.00 665.35
    Dr 100.00 797.03
    Dw 100.00 102.12
  COND 100.00 799.27
   IND 100.00  97.56

COND_m 100.00 144.83
 IND_m 100.00  75.00

 Ir_m1 100.00 100.00
 Dr_m1 100.00 100.02
 Dw_m1 100.00 104.12

 Ir_mm 100.00 100.00
 Dr_mm 100.00 100.00
 Dw_mm 100.00 100.00

100% is baseline; numbers larger than that are improvements.  The COND
measurement indicates, for example, that there 1/8 as many conditional
branches in the word-at-a-time version.
  • Loading branch information...
khwilliamson committed Nov 15, 2017
1 parent 46a08a6 commit e17544a60909ed9555c0dad7cd24afc40eb736e7
Showing with 112 additions and 16 deletions.
  1. +2 −2 embed.fnc
  2. +1 −1 ext/XS-APItest/APItest.pm
  3. +13 −0 ext/XS-APItest/APItest.xs
  4. +24 −0 ext/XS-APItest/t/utf8.t
  5. +70 −11 inline.h
  6. +2 −2 proto.h
View
@@ -778,9 +778,9 @@ ADMpR |bool |is_uni_print_lc|UV c
ADMpR |bool |is_uni_punct_lc|UV c
ADMpPR |bool |is_uni_xdigit_lc|UV c
AndmoR |bool |is_utf8_invariant_string|NN const U8* const s \
|STRLEN const len
|STRLEN len
AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \
|STRLEN const len \
|STRLEN len \
|NULLOK const U8 ** ep
AmnpdRP |bool |is_ascii_string|NN const U8* const s|const STRLEN len
AmnpdRP |bool |is_invariant_string|NN const U8* const s|const STRLEN len
@@ -5,7 +5,7 @@ use strict;
use warnings;
use Carp;
our $VERSION = '0.93';
our $VERSION = '0.94';
require XSLoader;
View
@@ -6004,6 +6004,19 @@ test_is_utf8_string(char *s, STRLEN len)
OUTPUT:
RETVAL
AV *
test_is_utf8_invariant_string_loc(char *s, STRLEN offset, STRLEN len)
PREINIT:
AV *av;
const U8 * ep;
CODE:
av = newAV();
av_push(av, newSViv(is_utf8_invariant_string_loc((U8 *) s + offset, len, &ep)));
av_push(av, newSViv(ep - ((U8 *) s + offset)));
RETVAL = av;
OUTPUT:
RETVAL
AV *
test_is_utf8_string_loc(char *s, STRLEN len)
PREINIT:
View
@@ -15,6 +15,30 @@ $|=1;
use XS::APItest;
my $s = "A" x 100 ;
my $ret_ref = test_is_utf8_invariant_string_loc($s, 0, length $s);
is($ret_ref->[0], 1, "is_utf8_invariant_string_loc returns TRUE for invariant");
my $above_word_length = 9;
for my $initial (0 .. $above_word_length) {
for my $offset (0 .. $above_word_length) {
for my $trailing (0 .. $above_word_length) {
if ($initial >= $offset) {
my $variant_pos = $initial - $offset;
$s = "A" x $initial . "\x80" . "A" x $trailing;
my $ret_ref = test_is_utf8_invariant_string_loc($s, $offset,
length $s);
is($ret_ref->[0], 0, "is_utf8_invariant_string_loc returns"
. " FALSE for variant at $variant_pos,"
. " first $offset ignored)");
is($ret_ref->[1], $variant_pos,
" And returns the correct position");
}
}
}
}
my $pound_sign = chr utf8::unicode_to_native(163);
# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
View
@@ -370,29 +370,88 @@ UTF-8 invariant, this function does not change the contents of C<*ep>.
=cut
XXX On ASCII machines this could be sped up by doing word-at-a-time operations
*/
PERL_STATIC_INLINE bool
S_is_utf8_invariant_string_loc(const U8* const s, const STRLEN len, const U8 ** ep)
S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
{
const U8* const send = s + (len ? len : strlen((const char *)s));
const U8* send;
const U8* x = s;
PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
while (x < send) {
if (UTF8_IS_INVARIANT(*x)) {
x++;
continue;
if (len == 0) {
len = strlen((const char *)s);
}
send = s + len;
#ifndef EBCDIC
/* Try to get the widest word on this platform */
# ifdef HAS_LONG_LONG
# define PERL_WORDCAST unsigned long long
# define PERL_WORDSIZE LONGLONGSIZE
# else
# define PERL_WORDCAST UV
# define PERL_WORDSIZE UVSIZE
# endif
# if PERL_WORDSIZE == 4
# define PERL_VARIANTS_WORD_MASK 0x80808080
# define PERL_WORD_BOUNDARY_MASK 0x3
# elif PERL_WORDSIZE == 8
# define PERL_VARIANTS_WORD_MASK 0x8080808080808080
# define PERL_WORD_BOUNDARY_MASK 0x7
# else
# error Unexpected word size
# endif
/* Process per-byte until reach word boundary. XXX This loop could be
* eliminated if we knew that this platform had fast unaligned reads */
while (x < send && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) {
if (! UTF8_IS_INVARIANT(*x)) {
if (ep) {
*ep = x;
}
return FALSE;
}
x++;
}
/* Process per-word as long as we have at least a full word left */
while (x + PERL_WORDSIZE <= send) {
if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK) {
/* Found a variant. Just return if caller doesn't want its exact
* position */
if (! ep) {
return FALSE;
}
if (ep) {
*ep = x;
/* Otherwise fall into final loop to find which byte it is */
break;
}
x += PERL_WORDSIZE;
}
return FALSE;
# undef PERL_WORDCAST
# undef PERL_WORDSIZE
# undef PERL_WORD_BOUNDARY_MASK
# undef PERL_VARIANTS_WORD_MASK
#endif
/* Process per-byte */
while (x < send) {
if (! UTF8_IS_INVARIANT(*x)) {
if (ep) {
*ep = x;
}
return FALSE;
}
x++;
}
return TRUE;
View
@@ -1626,11 +1626,11 @@ PERL_CALLCONV bool Perl_is_utf8_idfirst(pTHX_ const U8 *p)
#define PERL_ARGS_ASSERT_IS_UTF8_IDFIRST \
assert(p)
/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN const len)
/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN len)
__attribute__warn_unused_result__; */
#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN const len, const U8 ** ep)
PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC \
assert(s)

0 comments on commit e17544a

Please sign in to comment.