Permalink
Browse files

XXX flesh out commit msg: Search for UTF-8 invariants by word

  • Loading branch information...
khwilliamson committed Nov 15, 2017
1 parent 60fae40 commit c0905b8b454ac194abf559ae1ad3f436d42603b2
Showing with 112 additions and 15 deletions.
  1. +2 −2 embed.fnc
  2. +13 −0 ext/XS-APItest/APItest.xs
  3. +25 −0 ext/XS-APItest/t/utf8.t
  4. +70 −11 inline.h
  5. +2 −2 proto.h
View
@@ -778,9 +778,9 @@ ADMpR |bool |is_uni_print_lc|UV c
ADMpR |bool |is_uni_punct_lc|UV c
ADMpPR |bool |is_uni_xdigit_lc|UV c
AndmoR |bool |is_utf8_invariant_string|NN const U8* const s \
|STRLEN const len
|STRLEN len
AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \
|STRLEN const len \
|STRLEN len \
|NULLOK const U8 ** ep
AmnpdRP |bool |is_ascii_string|NN const U8* const s|const STRLEN len
AmnpdRP |bool |is_invariant_string|NN const U8* const s|const STRLEN len
View
@@ -6004,6 +6004,19 @@ test_is_utf8_string(char *s, STRLEN len)
OUTPUT:
RETVAL
AV *
test_is_utf8_invariant_string_loc(char *s, STRLEN offset, STRLEN len)
PREINIT:
AV *av;
const U8 * ep;
CODE:
av = newAV();
av_push(av, newSViv(is_utf8_invariant_string_loc((U8 *) s + offset, len, &ep)));
av_push(av, newSViv(ep - ((U8 *) s + offset)));
RETVAL = av;
OUTPUT:
RETVAL
AV *
test_is_utf8_string_loc(char *s, STRLEN len)
PREINIT:
View
@@ -15,6 +15,31 @@ $|=1;
use XS::APItest;
my $s = "A" x 100 ;
my $ret_ref = test_is_utf8_invariant_string_loc($s, 0, length $s);
is($ret_ref->[0], 1, "is_utf8_invariant_string_loc returns TRUE for invariant");
my $above_word_length = 9;
$|=1;
for my $initial (0 .. $above_word_length) {
for my $offset (0 .. $above_word_length) {
for my $trailing (0 .. $above_word_length) {
if ($initial >= $offset) {
my $variant_pos = $initial - $offset;
$s = "A" x $initial . "\x80" . "A" x $trailing;
#diag "initial=$initial; offset=$offset";
#diag $s;
#diag substr($s, $offset);
my $ret_ref = test_is_utf8_invariant_string_loc($s, $offset, length $s);
is($ret_ref->[0], 0, "is_utf8_invariant_string_loc returns FALSE for variant at $variant_pos, first $offset ignored)");
#diag $ret_ref->[1];
is($ret_ref->[1], $variant_pos, " And returns the correct position");
}
}
}
}
my $pound_sign = chr utf8::unicode_to_native(163);
# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
View
@@ -370,29 +370,88 @@ UTF-8 invariant, this function does not change the contents of C<*ep>.
=cut
XXX On ASCII machines this could be sped up by doing word-at-a-time operations
*/
PERL_STATIC_INLINE bool
S_is_utf8_invariant_string_loc(const U8* const s, const STRLEN len, const U8 ** ep)
S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
{
const U8* const send = s + (len ? len : strlen((const char *)s));
const U8* send;
const U8* x = s;
PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
while (x < send) {
if (UTF8_IS_INVARIANT(*x)) {
x++;
continue;
if (len == 0) {
len = strlen((const char *)s);
}
send = s + len;
#ifndef EBCDIC
/* Try to get the widest word on this platform */
# ifdef HAS_LONG_LONG
# define WORDCAST unsigned long long
# define WORDSIZE LONGLONGSIZE
# else
# define WORDCAST UV
# define WORDSIZE UVSIZE
# endif
# if WORDSIZE == 4
# define VARIANTS_WORD_MASK 0x80808080
# define WORD_BOUNDARY_MASK 0x3
# elif WORDSIZE == 8
# define VARIANTS_WORD_MASK 0x8080808080808080
# define WORD_BOUNDARY_MASK 0x7
# else
# error Unexpected word size
# endif
/* Process per-byte until reach word boundary. XXX This loop could be
* eliminated if we knew that this platform had fast unaligned reads */
while (x < send && (PTR2nat(x) & WORD_BOUNDARY_MASK)) {
if (! UTF8_IS_INVARIANT(*x)) {
if (ep) {
*ep = x;
}
return FALSE;
}
x++;
}
/* Process per-word as long as we have at least a full word left */
while (x + WORDSIZE <= send) {
if ((* (WORDCAST *) x) & VARIANTS_WORD_MASK) {
/* Found a variant. Just return if caller doesn't want its exact
* position */
if (! ep) {
return FALSE;
}
if (ep) {
*ep = x;
/* Otherwise fall into final loop to find which byte it is */
break;
}
x += WORDSIZE;
}
return FALSE;
# undef WORDCAST
# undef WORDSIZE
# undef WORD_BOUNDARY_MASK
# undef VARIANTS_WORD_MASK
#endif
/* Process per-byte */
while (x < send) {
if (! UTF8_IS_INVARIANT(*x)) {
if (ep) {
*ep = x;
}
return FALSE;
}
x++;
}
return TRUE;
View
@@ -1626,11 +1626,11 @@ PERL_CALLCONV bool Perl_is_utf8_idfirst(pTHX_ const U8 *p)
#define PERL_ARGS_ASSERT_IS_UTF8_IDFIRST \
assert(p)
/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN const len)
/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN len)
__attribute__warn_unused_result__; */
#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN const len, const U8 ** ep)
PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC \
assert(s)

0 comments on commit c0905b8

Please sign in to comment.