Permalink
Browse files

Use the Hoehrmann UTF-8 decoder

  • Loading branch information...
khwilliamson committed Oct 31, 2017
1 parent ad341a1 commit 0f5a25f7fd6b73508b45f06bd910cb4058deb337
Showing with 210 additions and 11 deletions.
  1. +15 −0 embed.fnc
  2. +7 −1 embed.h
  3. +138 −0 inline.h
  4. +1 −1 locale.c
  5. +15 −3 proto.h
  6. +1 −1 regcomp.c
  7. +33 −5 utf8.c
View
@@ -1812,11 +1812,26 @@ Adop |UV |utf8n_to_uvchr |NN const U8 *s \
|STRLEN curlen \
|NULLOK STRLEN *retlen \
|const U32 flags
#ifdef EBCDIC
Adp |UV |utf8n_to_uvchr_error|NN const U8 *s \
|STRLEN curlen \
|NULLOK STRLEN *retlen \
|const U32 flags \
|NULLOK U32 * errors
#else
: On ASCII, inline the easy part, use helper fcn for rest
Aidp |UV |utf8n_to_uvchr_error|NN const U8 *s \
|STRLEN curlen \
|NULLOK STRLEN *retlen \
|const U32 flags \
|NULLOK U32 * errors
AdpM |UV |_utf8n_to_uvchr_error|NN const U8 *s \
|STRLEN curlen \
|NULLOK STRLEN *retlen \
|const U32 flags \
|NULLOK U32 * errors \
|const UV codepoint
#endif
AipnR |UV |valid_utf8_to_uvchr |NN const U8 *s|NULLOK STRLEN *retlen
Ap |UV |utf8n_to_uvuni|NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
View
@@ -742,7 +742,6 @@
#define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b)
#define utf8_to_uvuni(a,b) Perl_utf8_to_uvuni(aTHX_ a,b)
#define utf8_to_uvuni_buf(a,b,c) Perl_utf8_to_uvuni_buf(aTHX_ a,b,c)
#define utf8n_to_uvchr_error(a,b,c,d,e) Perl_utf8n_to_uvchr_error(aTHX_ a,b,c,d,e)
#define utf8n_to_uvuni(a,b,c,d) Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
#define uvoffuni_to_utf8_flags(a,b,c) Perl_uvoffuni_to_utf8_flags(aTHX_ a,b,c)
#define uvuni_to_utf8(a,b) Perl_uvuni_to_utf8(aTHX_ a,b)
@@ -773,6 +772,10 @@
#define whichsig_pvn(a,b) Perl_whichsig_pvn(aTHX_ a,b)
#define whichsig_sv(a) Perl_whichsig_sv(aTHX_ a)
#define wrap_op_checker(a,b,c) Perl_wrap_op_checker(aTHX_ a,b,c)
#if !(defined(EBCDIC))
#define _utf8n_to_uvchr_error(a,b,c,d,e,f) Perl__utf8n_to_uvchr_error(aTHX_ a,b,c,d,e,f)
#define utf8n_to_uvchr_error(a,b,c,d,e) Perl_utf8n_to_uvchr_error(aTHX_ a,b,c,d,e)
#endif
#if !(defined(HAS_MEMMEM))
#define ninstr Perl_ninstr
#endif
@@ -810,6 +813,9 @@
#define pad_setsv(a,b) Perl_pad_setsv(aTHX_ a,b)
#define pad_sv(a) Perl_pad_sv(aTHX_ a)
#endif
#if defined(EBCDIC)
#define utf8n_to_uvchr_error(a,b,c,d,e) Perl_utf8n_to_uvchr_error(aTHX_ a,b,c,d,e)
#endif
#if defined(HAS_SIGACTION) && defined(SA_SIGINFO)
#define csighandler Perl_csighandler
#endif
View
138 inline.h
@@ -278,6 +278,142 @@ S_append_utf8_from_native_byte(const U8 byte, U8** dest)
}
}
#ifndef EBCDIC
/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
#define PERL_UTF8_DECODE_REJECT 12
/* DFA table for decoding UTF-8, in the layout of Bjoern Hoehrmann's decoder:
 * 256 byte-to-class entries followed by a transition table with 12 columns
 * (one per character class).  States are stored pre-multiplied by 12, so a
 * state value is directly the offset of its row; the next state is found at
 * index 256 + state + class.  State 0 is "accept" and state 12 is "reject"
 * (PERL_UTF8_DECODE_REJECT).
 *
 * In this variant byte 0xED has class 4, which from the start state leads to
 * state 60; that state rejects the continuation bytes 0xA0-0xBF, i.e. the
 * encodings of the surrogates U+D800..U+DFFF.
 * NOTE(review): presumably named for Unicode Corrigendum #9; this table is
 * not referenced by the code visible in this hunk -- confirm its user. */
static const U8 utf8d_C9[] = {
/* The first part of the table maps bytes to character classes, in order
 * to reduce the size of the transition table and create bitmasks. */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /*-FF*/
/* The second part is a transition table that maps a combination
 * of a state of the automaton and a character class to a state.
 * Each pair of 12-entry groups below is two consecutive state rows
 * (states 0 and 12, 24 and 36, 48 and 60, 72 and 84); the last line is
 * state 96. */
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12
};
/* DFA table used by the inline Perl_utf8n_to_uvchr_error() below.  Layout:
 * 256 byte-to-class entries followed by a transition table with 12 columns
 * (one per character class).  States are stored pre-multiplied by 12, so the
 * next state is read from index 256 + state + class.  State 0 is "accept"
 * and state 12 is "reject" (PERL_UTF8_DECODE_REJECT).
 *
 * Unlike utf8d_C9, byte 0xED has class 3 here (the same class as the other
 * 0xE1-0xEF start bytes), so it goes to state 36, which accepts every
 * continuation-byte class (1, 7 and 9); sequences encoding the surrogates
 * U+D800..U+DFFF are therefore accepted by this table. */
static const U8 utf8d_with_surrogates[] = {
/* The first part of the table maps bytes to character classes to reduce
 * the size of the transition table and create bitmasks (the decoder masks
 * a start byte with 0xff >> class to extract its payload bits). */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
/* The second part is a transition table that maps a combination
 * of a state of the automaton and a character class to a state.
 * Each pair of 12-entry groups below is two consecutive state rows
 * (states 0 and 12, 24 and 36, 48 and 60, 72 and 84); the last line is
 * state 96. */
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12
};
/* Fast path for decoding the UTF-8 encoded code point that starts at 's'.
 *
 * The bytes are run through the 'utf8d_with_surrogates' DFA, looking at no
 * more than 'curlen' of them.  If the DFA accepts, and the result is not one
 * the caller asked (via 'flags') to warn about or disallow, the code point is
 * returned directly, '*retlen' (if non-NULL) is set to the number of bytes
 * consumed and '*errors' (if non-NULL) is set to 0.
 *
 * Every other case -- empty input, the DFA rejecting, input running out in
 * the middle of a sequence, or a possibly restricted code point -- is handed
 * to _utf8n_to_uvchr_error().  The last argument of that call is the already
 * decoded code point when we have one, and 0 when we do not. */
PERL_STATIC_INLINE UV
Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                STRLEN curlen,
                                STRLEN *retlen,
                                const U32 flags,
                                U32 * errors)
{
    const U8 * p = s;                       /* next byte to examine */
    const U8 * const end = s + curlen;      /* one past the usable input */
    const U32 restricting_flags = UTF8_DISALLOW_ILLEGAL_INTERCHANGE
                                | UTF8_WARN_ILLEGAL_INTERCHANGE;
    UV byte_class;
    UV dfa_state;
    UV cp;

    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;

    /* Nothing to decode: the helper knows how to report that */
    if (UNLIKELY(curlen == 0)) {
        return _utf8n_to_uvchr_error(s, curlen, retlen, flags, errors, 0);
    }

    /* Start byte: its class gives both the mask that isolates its payload
     * bits and the column of the initial-state row of the transition table */
    byte_class = utf8d_with_surrogates[*p];
    cp = (0xff >> byte_class) & (*p);
    dfa_state = utf8d_with_surrogates[256 + byte_class];
    p++;

    for (;;) {

        /* Give up on the fast path if the DFA rejected the sequence, or if
         * the input ended before the DFA reached the accepting state */
        if (   UNLIKELY(dfa_state == PERL_UTF8_DECODE_REJECT)
            || UNLIKELY(p >= end && dfa_state != 0))
        {
            return _utf8n_to_uvchr_error(s, curlen, retlen, flags, errors, 0);
        }

        if (dfa_state == 0) {   /* Accepted: 'cp' is complete */
            break;
        }

        /* Continuation byte: append its low six bits and advance the DFA */
        byte_class = utf8d_with_surrogates[*p];
        cp = (*p & 0x3fu) | (cp << 6);
        dfa_state = utf8d_with_surrogates[256 + dfa_state + byte_class];
        p++;
    }

    /* The lowest code point that the flags could restrict is the first
     * surrogate.  From there on up, let the helper decide, passing it the
     * value we have already calculated */
    if ((flags & restricting_flags) && cp >= UNICODE_SURROGATE_FIRST) {
        return _utf8n_to_uvchr_error(s, curlen, retlen, flags, errors, cp);
    }

    if (retlen) {
        *retlen = p - s;
    }

    if (errors) {
        *errors = 0;
    }

    return cp;
}
#undef PERL_UTF8_DECODE_REJECT
#endif
/*
=for apidoc valid_utf8_to_uvchr
Like C<L</utf8_to_uvchr_buf>>, but should only be called when it is known that
@@ -287,6 +423,8 @@ non-Unicode code points are allowed.
=cut
khw tried seeing if the dfa version above was faster, but it wasn't
*/
PERL_STATIC_INLINE UV
View
@@ -2703,7 +2703,7 @@ S_print_bytes_for_locale(pTHX_
while (t < e) {
UV cp = (is_utf8)
? utf8_to_uvchr_buf((U8 *) t, e, NULL)
? utf8_to_uvchr_buf((U8 *) t, (U8 *) e, NULL)
: * (U8 *) t;
if (isPRINT(cp)) {
if (! prev_was_printable) {
View
18 proto.h
@@ -3642,9 +3642,6 @@ PERL_CALLCONV UV Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLE
PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags);
#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR \
assert(s)
PERL_CALLCONV UV Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 * errors);
#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR \
assert(s)
PERL_CALLCONV UV Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
#define PERL_ARGS_ASSERT_UTF8N_TO_UVUNI \
assert(s)
@@ -3788,6 +3785,16 @@ STATIC int S_sv_2iuv_non_preserve(pTHX_ SV *const sv);
# endif
# endif
#endif
#if !(defined(EBCDIC))
PERL_CALLCONV UV Perl__utf8n_to_uvchr_error(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 * errors, const UV codepoint);
#define PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_ERROR \
assert(s)
#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE UV Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 * errors);
#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR \
assert(s)
#endif
#endif
#if !(defined(HAS_MEMMEM))
PERL_CALLCONV char* Perl_ninstr(const char* big, const char* bigend, const char* little, const char* lend)
__attribute__warn_unused_result__
@@ -4193,6 +4200,11 @@ PERL_CALLCONV void Perl_dump_sv_child(pTHX_ SV *sv);
#define PERL_ARGS_ASSERT_DUMP_SV_CHILD \
assert(sv)
#endif
#if defined(EBCDIC)
PERL_CALLCONV UV Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 * errors);
#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR \
assert(s)
#endif
#if defined(HAS_MEMMEM)
PERL_CALLCONV char* Perl_ninstr(const char* big, const char* bigend, const char* little, const char* lend)
__attribute__warn_unused_result__
View
@@ -14520,7 +14520,7 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
p++;
}
else {
input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, (U8 *) e, NULL);
p+= UTF8SKIP(p);
}
View
38 utf8.c
@@ -1272,11 +1272,20 @@ flag to suppress any warnings, and then examine the C<*errors> return.
*/
UV
Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
U32 * errors)
#ifdef EBCDIC
Perl_utf8n_to_uvchr_error
#else
Perl__utf8n_to_uvchr_error
#endif
(pTHX_ const U8 *s,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
U32 * errors
#ifndef EBCDIC
, const UV precalc_uv
#endif
)
{
const U8 * const s0 = s;
U8 * send = NULL; /* (initialized to silence compilers' wrong
@@ -1299,7 +1308,11 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
routine; see [perl #130921] */
UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
#ifdef EBCDIC
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
#else
PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_ERROR;
#endif
if (errors) {
*errors = 0;
@@ -1308,6 +1321,19 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
errors = &discard_errors;
}
#ifndef EBCDIC
if (precalc_uv) {
uv = precalc_uv;
curlen = UVCHR_SKIP(uv);
if (retlen) {
*retlen = curlen;
}
goto got_codepoint;
}
#endif
/* The order of malformation tests here is important. We should consume as
* few bytes as possible in order to not skip any valid character. This is
* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
@@ -1478,6 +1504,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
}
}
got_codepoint:
/* Here, we have found all the possible problems, except for when the input
* is for a problematic code point not allowed by the input parameters. */

0 comments on commit 0f5a25f

Please sign in to comment.