Skip to content

Commit

Permalink
utf8.h: Remove EBCDIC dependency
Browse files Browse the repository at this point in the history
By generalizing isUTF8_POSSIBLY_PROBLEMATIC, a single definition in utf8.h can serve both ASCII and EBCDIC platforms.
  • Loading branch information
khwilliamson committed Aug 7, 2021
1 parent bdcc1e9 commit 6f8b1f9
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 12 deletions.
17 changes: 10 additions & 7 deletions utf8.h
Expand Up @@ -255,13 +255,6 @@ are in the character. */
* for more */
/* Value used for ^? in this branch of the EBCDIC-vs-ASCII conditional.
 * NOTE(review): presumably the ASCII side -- confirm against the enclosing
 * #if. */
#define QUESTION_MARK_CTRL DEL_NATIVE

/* Surrogates, non-character code points and above-Unicode code points are
 * problematic in some contexts.  This allows code that needs to check for
 * those to quickly exclude the vast majority of code points it will
 * encounter.
 *
 * 'c' is the start byte of a UTF-8 encoded character; FITS_IN_8_BITS(c) is
 * handed to __ASSERT_.  0xED is the UTF-8 start byte of U+D800, the smallest
 * surrogate, so any start byte below it cannot begin a problematic code
 * point.
 *
 * The argument is parenthesized before the cast so that an expression
 * argument (e.g. 'x & 0xFF') is cast and compared as a whole, rather than
 * having the cast bind only to its first operand.  'c' appears more than once
 * in the expansion, so avoid passing an argument with side effects. */
#define isUTF8_POSSIBLY_PROBLEMATIC(c)  (__ASSERT_(FITS_IN_8_BITS(c))        \
                                         ((U8) (c)) >= 0xED)

#endif /* EBCDIC vs ASCII */

/* It turns out that in a number of cases, handling ASCII vs EBCDIC is a
Expand Down Expand Up @@ -858,6 +851,16 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
&& _is_in_locale_category(FALSE, -1))) \
&& (! IN_BYTES))

/* Some callers must treat surrogates, non-character code points, and code
 * points above the Unicode range specially.  The two macros below give such
 * callers a cheap first test that rules out the vast majority of the code
 * points they will encounter.
 *
 * The smallest surrogate, U+D800, is the lowest such code point.  0xD800
 * occupies 16 bits, and that width is what gets passed to UTF_START_BYTE to
 * calculate the start byte of its encoding.
 *
 * isUNICODE_POSSIBLY_PROBLEMATIC takes a code point 'uv'.
 * isUTF8_POSSIBLY_PROBLEMATIC takes a byte 'c', which is run through
 * NATIVE_UTF8_TO_I8 before being compared against that start byte. */
#define isUNICODE_POSSIBLY_PROBLEMATIC(uv)                                  \
                                    (UNICODE_SURROGATE_FIRST <= (uv))
#define isUTF8_POSSIBLY_PROBLEMATIC(c)                                      \
    (UTF_START_BYTE(UNICODE_SURROGATE_FIRST, 16) <= NATIVE_UTF8_TO_I8(c))

/* Perl extends Unicode so that it is possible to encode (as extended UTF-8 or
* UTF-EBCDIC) any 64-bit value. No standard known to khw ever encoded higher
Expand Down
5 changes: 0 additions & 5 deletions utfebcdic.h
Expand Up @@ -199,11 +199,6 @@ explicitly forbidden, and the shortest possible encoding should always be used

/* Expose the UTF-EBCDIC-specific constant under the encoding-neutral name. */
#define UTF_CONTINUATION_BYTE_INFO_BITS UTF_EBCDIC_CONTINUATION_BYTE_INFO_BITS

/* These others are for efficiency or for other decisions we've made */

/* Classify byte 'c' by delegating to _generic_isCC() with the
 * _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE class.
 * NOTE(review): going by the class name, this is true when 'c' is a start
 * byte for a code point at or above the first surrogate -- confirm against
 * the definition of _generic_isCC(). */
#define isUTF8_POSSIBLY_PROBLEMATIC(c) \
_generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)

/* ^? is defined to be APC on EBCDIC systems, as specified in Unicode Technical
 * Report #16.  APC is code point 0x9F in Latin-1, hence the literal below,
 * which is mapped through LATIN1_TO_NATIVE().  See the definition of toCTRL()
 * for more */
#define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F)
Expand Down

0 comments on commit 6f8b1f9

Please sign in to comment.