Skip to content

Commit

Permalink
XXX flesh out commit msg: Search for UTF-8 invariants by word
Browse files Browse the repository at this point in the history
  • Loading branch information
khwilliamson committed Nov 16, 2017
1 parent 60fae40 commit c0905b8
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 15 deletions.
4 changes: 2 additions & 2 deletions embed.fnc
Expand Up @@ -778,9 +778,9 @@ ADMpR |bool |is_uni_print_lc|UV c
ADMpR |bool |is_uni_punct_lc|UV c
ADMpPR |bool |is_uni_xdigit_lc|UV c
AndmoR |bool |is_utf8_invariant_string|NN const U8* const s \
|STRLEN const len
|STRLEN len
AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \
|STRLEN const len \
|STRLEN len \
|NULLOK const U8 ** ep
AmnpdRP |bool |is_ascii_string|NN const U8* const s|const STRLEN len
AmnpdRP |bool |is_invariant_string|NN const U8* const s|const STRLEN len
Expand Down
13 changes: 13 additions & 0 deletions ext/XS-APItest/APItest.xs
Expand Up @@ -6004,6 +6004,19 @@ test_is_utf8_string(char *s, STRLEN len)
OUTPUT:
RETVAL

AV *
test_is_utf8_invariant_string_loc(char *s, STRLEN offset, STRLEN len)
PREINIT:
AV *av;
const U8 * ep;
CODE:
av = newAV();
av_push(av, newSViv(is_utf8_invariant_string_loc((U8 *) s + offset, len, &ep)));
av_push(av, newSViv(ep - ((U8 *) s + offset)));
RETVAL = av;
OUTPUT:
RETVAL

AV *
test_is_utf8_string_loc(char *s, STRLEN len)
PREINIT:
Expand Down
25 changes: 25 additions & 0 deletions ext/XS-APItest/t/utf8.t
Expand Up @@ -15,6 +15,31 @@ $|=1;

use XS::APItest;


my $s = "A" x 100 ;
my $ret_ref = test_is_utf8_invariant_string_loc($s, 0, length $s);
is($ret_ref->[0], 1, "is_utf8_invariant_string_loc returns TRUE for invariant");

my $above_word_length = 9;
$|=1;
for my $initial (0 .. $above_word_length) {
for my $offset (0 .. $above_word_length) {
for my $trailing (0 .. $above_word_length) {
if ($initial >= $offset) {
my $variant_pos = $initial - $offset;
$s = "A" x $initial . "\x80" . "A" x $trailing;
#diag "initial=$initial; offset=$offset";
#diag $s;
#diag substr($s, $offset);
my $ret_ref = test_is_utf8_invariant_string_loc($s, $offset, length $s);
is($ret_ref->[0], 0, "is_utf8_invariant_string_loc returns FALSE for variant at $variant_pos, first $offset ignored)");
#diag $ret_ref->[1];
is($ret_ref->[1], $variant_pos, " And returns the correct position");
}
}
}
}

my $pound_sign = chr utf8::unicode_to_native(163);

# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
Expand Down
81 changes: 70 additions & 11 deletions inline.h
Expand Up @@ -370,29 +370,88 @@ UTF-8 invariant, this function does not change the contents of C<*ep>.
=cut
XXX On ASCII machines this could be sped up by doing word-at-a-time operations
*/

PERL_STATIC_INLINE bool
S_is_utf8_invariant_string_loc(const U8* const s, const STRLEN len, const U8 ** ep)
S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
{
const U8* const send = s + (len ? len : strlen((const char *)s));
const U8* send;
const U8* x = s;

PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;

while (x < send) {
if (UTF8_IS_INVARIANT(*x)) {
x++;
continue;
if (len == 0) {
len = strlen((const char *)s);
}

send = s + len;

#ifndef EBCDIC
/* Try to get the widest word on this platform */
# ifdef HAS_LONG_LONG
# define WORDCAST unsigned long long
# define WORDSIZE LONGLONGSIZE
# else
# define WORDCAST UV
# define WORDSIZE UVSIZE
# endif

# if WORDSIZE == 4
# define VARIANTS_WORD_MASK 0x80808080
# define WORD_BOUNDARY_MASK 0x3
# elif WORDSIZE == 8
# define VARIANTS_WORD_MASK 0x8080808080808080
# define WORD_BOUNDARY_MASK 0x7
# else
# error Unexpected word size
# endif

/* Process per-byte until reach word boundary. XXX This loop could be
* eliminated if we knew that this platform had fast unaligned reads */
while (x < send && (PTR2nat(x) & WORD_BOUNDARY_MASK)) {
if (! UTF8_IS_INVARIANT(*x)) {
if (ep) {
*ep = x;
}

return FALSE;
}
x++;
}

/* Process per-word as long as we have at least a full word left */
while (x + WORDSIZE <= send) {
if ((* (WORDCAST *) x) & VARIANTS_WORD_MASK) {

/* Found a variant. Just return if caller doesn't want its exact
* position */
if (! ep) {
return FALSE;
}

if (ep) {
*ep = x;
/* Otherwise fall into final loop to find which byte it is */
break;
}
x += WORDSIZE;
}

return FALSE;
# undef WORDCAST
# undef WORDSIZE
# undef WORD_BOUNDARY_MASK
# undef VARIANTS_WORD_MASK
#endif

/* Process per-byte */
while (x < send) {
if (! UTF8_IS_INVARIANT(*x)) {
if (ep) {
*ep = x;
}

return FALSE;
}

x++;
}

return TRUE;
Expand Down
4 changes: 2 additions & 2 deletions proto.h
Expand Up @@ -1626,11 +1626,11 @@ PERL_CALLCONV bool Perl_is_utf8_idfirst(pTHX_ const U8 *p)
#define PERL_ARGS_ASSERT_IS_UTF8_IDFIRST \
assert(p)

/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN const len)
/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN len)
__attribute__warn_unused_result__; */

#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN const len, const U8 ** ep)
PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC \
assert(s)
Expand Down

0 comments on commit c0905b8

Please sign in to comment.