From bd78ed18fc611992ba1dcdc81eb12d5cc3e88009 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 14 Jun 2021 06:04:44 -0600 Subject: [PATCH] utf8.h: Add symbol for easing EBCDIC handling This is then used in regcomp.c to avoid an #ifdef EBCDIC --- regcomp.c | 11 ++++++----- utf8.h | 5 +++++ utfebcdic.h | 1 + 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/regcomp.c b/regcomp.c index e7e83b737b21..b89f701cc273 100644 --- a/regcomp.c +++ b/regcomp.c @@ -19815,11 +19815,12 @@ S_optimize_regclass(pTHX_ * invariant bytes, because they have the same bit patterns under UTF-8 * as not. */ PERL_UINT_FAST8_T inverted = 0; -#ifdef EBCDIC - const PERL_UINT_FAST8_T max_permissible = 0xFF; -#else - const PERL_UINT_FAST8_T max_permissible = 0x7F; -#endif + + /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on + * EBCDIC */ + const PERL_UINT_FAST8_T max_permissible + = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT); + /* If doesn't fit the criteria for ANYOFM, invert and try again. If * that works we will instead later generate an NANYOFM, and invert * back when through */ diff --git a/utf8.h b/utf8.h index ce7f57fcceb7..fccc95d33135 100644 --- a/utf8.h +++ b/utf8.h @@ -265,6 +265,11 @@ are in the character. */ #define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ (U8) c >= 0xED) +/* It turns out that in a number of cases, handling ASCII vs EBCDIC is a + * matter of being off-by-one. So this is a convenience macro, used to avoid + * some #ifdefs. 
*/ +# define ONE_IF_EBCDIC_ZERO_IF_NOT 0 + #define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x7FFFFFFF) #endif /* EBCDIC vs ASCII */ diff --git a/utfebcdic.h b/utfebcdic.h index f6a54ab5d6d9..d016641cbf82 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -228,6 +228,7 @@ explicitly forbidden, and the shortest possible encoding should always be used #define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x3FFFFFFF) +#define ONE_IF_EBCDIC_ZERO_IF_NOT 1 /* * ex: set ts=8 sts=4 sw=4 et: