From d091ac8d7cae228c8afecab4ba4aec9a770ea8a8 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 18 Mar 2021 10:03:20 -0600 Subject: [PATCH] handy.h: Add wrapper layer macros for isalnum() ... This adds a new set of macros, forming a lower layer to what is currently there to wrap the character classification libc functions, isdigit() etc, and case changing ones, tolower(), toupper(). On most platforms these expand simply to the libc function call. But on Windows, they expand to something more complex, to bring the Windows calls into POSIX compliance. Previously that was achieved at the higher level, with the result that lower level calls were broken. This resulted in parts of the test suite being skipped on Windows. The current level is rewritten to use the new lower layer, with the result that it is simpler, as the complexity is now done further down. I thought about calling these macros is_porcelain_isalnum or something similar to emphasize that they are close to the bare libc version, but thought isU8_alnum() is shorter and conveys another truth, that being the input is assumed to be a byte, without checking. --- handy.h | 103 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 59 insertions(+), 44 deletions(-) diff --git a/handy.h b/handy.h index 1e331094684b..8dfa9d3e64eb 100644 --- a/handy.h +++ b/handy.h @@ -1887,23 +1887,27 @@ END_EXTERN_C /* Use the libc versions for these if available. */ #if defined(HAS_ISASCII) -# define isASCII_LC(c) (FITS_IN_8_BITS(c) && isascii( (U8) (c))) +# define isU8_ASCII_LC(c) isascii((U8) (c)) #else -# define isASCII_LC(c) isASCII(c) +# define isU8_ASCII_LC(c) isASCII(c) #endif #if defined(HAS_ISBLANK) -# define isBLANK_LC(c) generic_LC_(c, CC_BLANK_, isblank) -#else /* Unlike isASCII, varies if in a UTF-8 locale */ -# define isBLANK_LC(c) ((IN_UTF8_CTYPE_LOCALE) ? 
isBLANK_L1(c) : isBLANK(c)) +# define isU8_BLANK_LC(c) isblank((U8) (c)) +#else +# define isU8_BLANK_LC(c) isBLANK(c) #endif -# define isCNTRL_LC(c) generic_LC_(c, CC_CNTRL_, iscntrl) -# define isSPACE_LC(c) generic_LC_(c, CC_SPACE_, isspace) -# define isIDFIRST_LC(c) (UNLIKELY((c) == '_') || isALPHA_LC(c)) -# define isWORDCHAR_LC(c) (UNLIKELY((c) == '_') || isALPHANUMERIC_LC(c)) +/* The next few are the same in all platforms. */ +#define isU8_CNTRL_LC(c) iscntrl((U8) (c)) +#define isU8_SPACE_LC(c) isspace((U8) (c)) +#define isU8_IDFIRST_LC(c) (UNLIKELY((c) == '_') || isU8_ALPHA_LC(c)) +#define isU8_WORDCHAR_LC(c) (UNLIKELY((c) == '_') || isU8_ALPHANUMERIC_LC(c)) -# ifdef WIN32 +/* The base-level case changing macros are also the same in all platforms */ +#define toU8_LOWER_LC(c) tolower((U8) (c)) +#define toU8_UPPER_LC(c) toupper((U8) (c)) +#define toU8_FOLD_LC(c) toU8_LOWER_LC(c) /* The Windows functions don't bother to follow the POSIX standard, which for * example says that something can't both be a printable and a control. But @@ -1915,40 +1919,51 @@ END_EXTERN_C * controls, and that \w and its subsets aren't ispunct(). Not all possible * weirdnesses are checked for, just ones that were detected on actual * Microsoft code pages */ +#ifdef WIN32 +# define isU8_ALPHA_LC(c) (isalpha((U8) (c)) && isU8_ALPHANUMERIC_LC(c)) +# define isU8_ALPHANUMERIC_LC(c) (isalnum((U8) (c)) && ! isU8_PUNCT_LC(c)) +# define isU8_DIGIT_LC(c) (isdigit((U8) (c)) && isU8_ALPHANUMERIC_LC(c)) +# define isU8_GRAPH_LC(c) (isgraph((U8) (c)) && isU8_PRINT_LC(c)) +# define isU8_LOWER_LC(c) (islower((U8) (c)) && isU8_ALPHA_LC(c)) +# define isU8_PRINT_LC(c) (isprint((U8) (c)) && ! isU8_CNTRL_LC(c)) +# define isU8_PUNCT_LC(c) (ispunct((U8) (c)) && ! 
isU8_CNTRL_LC(c)) +# define isU8_UPPER_LC(c) (isupper((U8) (c)) && isU8_ALPHA_LC(c)) +# define isU8_XDIGIT_LC(c) (isxdigit((U8)(c)) && isU8_ALPHANUMERIC_LC(c)) +#else -# define isALPHA_LC(c) (generic_LC_(c, CC_ALPHA_, isalpha) \ - && isALPHANUMERIC_LC(c)) -# define isALPHANUMERIC_LC(c) (generic_LC_(c, CC_ALPHANUMERIC_, isalnum) \ - && ! isPUNCT_LC(c)) -# define isDIGIT_LC(c) (generic_LC_(c, CC_DIGIT_, isdigit) \ - && isALPHANUMERIC_LC(c)) -# define isGRAPH_LC(c) (generic_LC_(c, CC_GRAPH_, isgraph) \ - && isPRINT_LC(c)) -# define isLOWER_LC(c) (generic_LC_(c, CC_LOWER_, islower) \ - && isALPHA_LC(c)) -# define isPRINT_LC(c) (generic_LC_(c, CC_PRINT_, isprint) \ - && ! isCNTRL_LC(c)) -# define isPUNCT_LC(c) (generic_LC_(c, CC_PUNCT_, ispunct) \ - && ! isCNTRL_LC(c)) -# define isUPPER_LC(c) (generic_LC_(c, CC_UPPER_, isupper) \ - && isALPHA_LC(c)) -# define isXDIGIT_LC(c) (generic_LC_(c, CC_XDIGIT_, isxdigit) \ - && isALPHANUMERIC_LC(c)) -# else +/* For all other platforms, as far as we know, isdigit(), etc. 
work sanely + * enough */ + +# define isU8_ALPHA_LC(c) isalpha((U8) (c)) +# define isU8_ALPHANUMERIC_LC(c) isalnum((U8) (c)) +# define isU8_DIGIT_LC(c) isdigit((U8) (c)) +# define isU8_GRAPH_LC(c) isgraph((U8) (c)) +# define isU8_LOWER_LC(c) islower((U8) (c)) +# define isU8_PRINT_LC(c) isprint((U8) (c)) +# define isU8_PUNCT_LC(c) ispunct((U8) (c)) +# define isU8_UPPER_LC(c) isupper((U8) (c)) +# define isU8_XDIGIT_LC(c) isxdigit((U8) (c)) +#endif -/* For all other platforms with, as far as we know, sane locales that the - * isdigit(), etc functions operate on */ - -# define isALPHA_LC(c) generic_LC_(c, CC_ALPHA_, isalpha) -# define isALPHANUMERIC_LC(c) generic_LC_(c, CC_ALPHANUMERIC_, isalnum) -# define isDIGIT_LC(c) generic_LC_(c, CC_DIGIT_, isdigit) -# define isGRAPH_LC(c) generic_LC_(c, CC_GRAPH_, isgraph) -# define isLOWER_LC(c) generic_LC_(c, CC_LOWER_, islower) -# define isPRINT_LC(c) generic_LC_(c, CC_PRINT_, isprint) -# define isPUNCT_LC(c) generic_LC_(c, CC_PUNCT_, ispunct) -# define isUPPER_LC(c) generic_LC_(c, CC_UPPER_, isupper) -# define isXDIGIT_LC(c) generic_LC_(c, CC_XDIGIT_, isxdigit) -# endif +/* The definitions below use the ones above to take a general input domain + * (though always returning false if the input doesn't fit in a byte), and to + * behave properly should the locale be UTF-8 */ +#define isASCII_LC(c) (FITS_IN_8_BITS(c) && isU8_ASCII_LC(c)) +#define isALPHA_LC(c) generic_LC_(c, CC_ALPHA_, isU8_ALPHA_LC) +#define isALPHANUMERIC_LC(c) \ + generic_LC_(c, CC_ALPHANUMERIC_, isU8_ALPHANUMERIC_LC) +#define isBLANK_LC(c) generic_LC_(c, CC_BLANK_, isU8_BLANK_LC) +#define isCNTRL_LC(c) generic_LC_(c, CC_CNTRL_, isU8_CNTRL_LC) +#define isDIGIT_LC(c) generic_LC_(c, CC_DIGIT_, isU8_DIGIT_LC) +#define isGRAPH_LC(c) generic_LC_(c, CC_GRAPH_, isU8_GRAPH_LC) +#define isIDFIRST_LC(c) generic_LC_(c, CC_IDFIRST_, isU8_IDFIRST_LC) +#define isLOWER_LC(c) generic_LC_(c, CC_LOWER_, isU8_LOWER_LC) +#define isPRINT_LC(c) generic_LC_(c, CC_PRINT_, 
isU8_PRINT_LC) +#define isPUNCT_LC(c) generic_LC_(c, CC_PUNCT_, isU8_PUNCT_LC) +#define isSPACE_LC(c) generic_LC_(c, CC_SPACE_, isU8_SPACE_LC) +#define isUPPER_LC(c) generic_LC_(c, CC_UPPER_, isU8_UPPER_LC) +#define isWORDCHAR_LC(c) generic_LC_(c, CC_WORDCHAR_, isU8_WORDCHAR_LC) +#define isXDIGIT_LC(c) generic_LC_(c, CC_XDIGIT_, isU8_XDIGIT_LC) #ifndef CTYPE256 # define toLOWER_LC(c) toLOWER_A(c) @@ -1965,7 +1980,7 @@ END_EXTERN_C ? (c) \ : ((IN_UTF8_CTYPE_LOCALE) \ ? PL_latin1_lc[ (U8) (c) ] \ - : (U8) tolower((U8) (c)))) + : ((U8) toU8_LOWER_LC(c)))) /* In this macro, note that the result can be larger than a byte in a UTF-8 * locale. It returns a single value, so can't adequately return the upper @@ -1977,7 +1992,7 @@ END_EXTERN_C ((! FITS_IN_8_BITS(c)) \ ? (c) \ : ((! IN_UTF8_CTYPE_LOCALE) \ - ? (U8) toupper((U8) (c)) \ + ? ((U8) toU8_UPPER_LC(c)) \ : (UNLIKELY(((U8)(c)) == MICRO_SIGN) \ ? GREEK_CAPITAL_LETTER_MU \ : ((UNLIKELY(((U8) (c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) \