Skip to content

Commit

Permalink
Reimplement OFFUNISKIP
Browse files Browse the repository at this point in the history
Now that previous commits have made it fast to find the position of the
first set bit in a word, we can use a formula to find how many bytes the
UTF-8 of that will occupy.  This allows for simplification of this
macro, removing several conditionals.
  • Loading branch information
khwilliamson committed Jun 14, 2021
1 parent 02bd4e7 commit 7d67d9e
Showing 1 changed file with 28 additions and 46 deletions.
74 changes: 28 additions & 46 deletions utf8.h
Expand Up @@ -313,55 +313,37 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
*/
/* Map 'cp' through NATIVE_TO_UNI() and ask OFFUNI_IS_INVARIANT() about the
 * result. */
#define UVCHR_IS_INVARIANT(cp)                                              \
    (OFFUNI_IS_INVARIANT(NATIVE_TO_UNI(cp)))

/* Internal macro to be used only in this file to aid in constructing other
* publicly accessible macros.
* The number of bytes required to express this uv in UTF-8, for just those
* uv's requiring 2 through 6 bytes, as these are common to all platforms and
* word sizes. The number of bytes needed is given by the number of leading 1
* bits in the start byte. There are 32 start bytes that have 2 initial 1 bits
* (C0-DF); there are 16 that have 3 initial 1 bits (E0-EF); 8 that have 4
* initial 1 bits (F0-F7); 4 that have 5 initial 1 bits (F8-FB), and 2 that
* have 6 initial 1 bits (FC-FD). The largest number a string of n bytes can
* represent is (the number of possible start bytes for 'n')
* * (the number of possibilities for each start byte)
* The latter in turn is
* 2 ** ( (how many continuation bytes there are)
* * (the number of bits of information each
* continuation byte holds))
/* The arithmetic below breaks down for small code points, and even if it
* didn't, the algorithms in my_msbit_pos() generally require the input to be
* non-zero, so would need to special case NUL. By using all the invariants,
* no extra conditionals are used, and we get past the point where the
* algorithms break, and on EBCDIC boxes, the fact that the invariants/two-byte
* code points are mixed falls out automatically.
*
* If we were on a platform where we could use a fast find first set bit
* instruction (or count leading zeros instruction) this could be replaced by
* using that to find the log2 of the uv, and divide that by the number of bits
* of information in each continuation byte, adjusting for large cases and how
* much information is in a start byte for that length */
/* Cascade of range tests that maps 'uv' to a byte count of 2 through 6.
 * Each limit is
 *      (number of start bytes for that length)
 *    * 2 ** (number of continuation bytes * UTF_ACCUMULATION_SHIFT)
 * The expansion deliberately ends in a dangling ':' -- the user of this macro
 * must follow it with the expression to use for values that need more than 6
 * bytes, and must wrap the whole thing in parentheses. */
#define __COMMON_UNI_SKIP(uv)                                               \
    ((UV) (uv) < (32 * (1U << (    UTF_ACCUMULATION_SHIFT)))) ? 2 :         \
    ((UV) (uv) < (16 * (1U << (2 * UTF_ACCUMULATION_SHIFT)))) ? 3 :         \
    ((UV) (uv) < ( 8 * (1U << (3 * UTF_ACCUMULATION_SHIFT)))) ? 4 :         \
    ((UV) (uv) < ( 4 * (1U << (4 * UTF_ACCUMULATION_SHIFT)))) ? 5 :         \
    ((UV) (uv) < ( 2 * (1U << (5 * UTF_ACCUMULATION_SHIFT)))) ? 6 :

/* Internal macro to be used only in this file.
* This adds to __COMMON_UNI_SKIP the details at this platform's upper range.
* For any-sized EBCDIC platforms, or 64-bit ASCII ones, we need one more test
* to see if just 7 bytes is needed, or if the maximum is needed. For 32-bit
* ASCII platforms, everything is representable by 7 bytes */
#if defined(UV_IS_QUAD) || defined(EBCDIC)
/* Completes the __COMMON_UNI_SKIP cascade: once the 2..6 byte tests have all
 * failed, the answer is 7 when 'uv' is below
 * 2 ** (6 * UTF_ACCUMULATION_SHIFT), which is hinted as the likely case, and
 * UTF8_MAXBYTES otherwise. */
# define __BASE_UNI_SKIP(uv)                                               \
    (__COMMON_UNI_SKIP(uv)                                                 \
        LIKELY((UV) (uv) < ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT)))      \
            ? 7 : UTF8_MAXBYTES)
* We need a new continuation byte for every increase in the size of 'uv' by
* (UTF_CONTINUATION_BYTE_INFO_BITS - 1). The -1 is because each new byte
* removes one bit of information from the start byte.
*/
/* Number of bytes occupied by the UTF-8 of the Unicode code point 'uv':
 *   - 1 when OFFUNI_IS_INVARIANT(uv) holds;
 *   - UTF8_MAXBYTES when OFFUNISKIP_helper_(uv) holds;
 *   - otherwise my_msbit_pos(uv) (presumably the position of the highest set
 *     bit) divided, rounding up, by one less than
 *     UTF_CONTINUATION_BYTE_INFO_BITS.
 * 'uv' may be evaluated more than once. */
#define OFFUNISKIP(uv)                                                      \
    ( ! (OFFUNI_IS_INVARIANT(uv))                                           \
      ? ( ! (OFFUNISKIP_helper_(uv))                                        \
          ? (  (my_msbit_pos(uv) + UTF_CONTINUATION_BYTE_INFO_BITS - 2)     \
             / (UTF_CONTINUATION_BYTE_INFO_BITS - 1))                       \
          : UTF8_MAXBYTES)                                                  \
      : 1)

/* We need to go to MAXBYTES when we can't represent 'uv' by the number of
* information bits in 6 continuation bytes (when we get to 6, the start byte
* has no information bits to add to the total). But on ASCII platforms, that
* doesn't happen until 6*6 bits, which is above the 32-bit word size, so on
* those platforms, this will always be false */
#if UVSIZE * CHARBITS > (6 * UTF_CONTINUATION_BYTE_INFO_BITS)
/* True (and hinted as unlikely) when 'uv' exceeds
 * nBIT_UMAX(6 * UTF_CONTINUATION_BYTE_INFO_BITS), i.e. when it cannot be
 * represented by the information bits of 6 continuation bytes.  The
 * parameter is parenthesized so that an argument containing an operator that
 * binds less tightly than '>' (such as 'a | b' or 'c ? x : y') is compared as
 * a whole, instead of having only its last operand compared. */
# define OFFUNISKIP_helper_(uv) \
UNLIKELY((uv) > nBIT_UMAX(6 * UTF_CONTINUATION_BYTE_INFO_BITS))
#else
/* On this branch, 7 is the result for anything that falls through the
 * __COMMON_UNI_SKIP tests. */
# define __BASE_UNI_SKIP(uv)                                               \
    (__COMMON_UNI_SKIP(uv)                                                 \
        7)
/* On this branch the word is not wide enough for the too-large case to
 * arise, so the answer is constant false; 'uv' is not evaluated. */
# define OFFUNISKIP_helper_(uv)                                            \
    0
#endif

/* The next two macros use the base macro defined above, and add in the tests
* at the low-end of the range, for just 1 byte, yielding complete macros,
* publicly accessible. */

/* Input is a true Unicode (not-native) code point */
/* Byte count for the Unicode (not native) code point 'uv': 1 when it is an
 * invariant, otherwise whatever __BASE_UNI_SKIP yields. */
#define OFFUNISKIP(uv)                                                      \
    ( ! (OFFUNI_IS_INVARIANT(uv)) ? __BASE_UNI_SKIP(uv) : 1)

/*
=for apidoc Am|STRLEN|UVCHR_SKIP|UV cp
Expand All @@ -371,7 +353,7 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
=cut
*/
/* 1 when UVCHR_IS_INVARIANT(uv) holds, otherwise the result of
 * __BASE_UNI_SKIP(uv). */
#define UVCHR_SKIP(uv)                                                      \
    ( ! (UVCHR_IS_INVARIANT(uv)) ? __BASE_UNI_SKIP(uv) : 1)
/* Pass 'uv' through NATIVE_TO_UNI(), then let OFFUNISKIP() size the result. */
#define UVCHR_SKIP(uv)                                                      \
    OFFUNISKIP(NATIVE_TO_UNI(uv))

/* UTF_START_MARK(2) with the bits of UTF_CONTINUATION_MARK, shifted right by
 * UTF_ACCUMULATION_SHIFT, OR'd in. */
#define UTF_MIN_START_BYTE                                                  \
    (UTF_START_MARK(2) | (UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT))
Expand Down

0 comments on commit 7d67d9e

Please sign in to comment.