From 8a949e539a4501bbae40c157e82b1cd503cd191e Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 25 May 2021 10:33:32 -0600 Subject: [PATCH 1/5] Dumper.xs: Port ESC_NATIVE back on EBCDIC Wrap its definition with LATIN1_TO_NATIVE() which will automatically port it back as far as possible. This is the only use of ESC_NATIVE on cpan, so I don't think it's worth putting it in ppport.h --- dist/Data-Dumper/Dumper.xs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/Data-Dumper/Dumper.xs b/dist/Data-Dumper/Dumper.xs index 150a33a21a1b..59458b948e7e 100644 --- a/dist/Data-Dumper/Dumper.xs +++ b/dist/Data-Dumper/Dumper.xs @@ -22,7 +22,7 @@ * calling this .xs file for releases where they aren't defined */ #ifndef ESC_NATIVE /* \e */ -# define ESC_NATIVE 27 +# define ESC_NATIVE LATIN1_TO_NATIVE(27) #endif /* SvPVCLEAR only from perl 5.25.6 */ From 3c44e7bf896a9c9794b0ed0cb97f4d710390d5d7 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 25 May 2021 11:48:58 -0600 Subject: [PATCH 2/5] Dumper.xs: isDIGIT() is now in ppport.h So use it. This makes the code cleaner, and removes a conditional (unless the compiler optimizer figures out to do so anyway). --- dist/Data-Dumper/Dumper.xs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/dist/Data-Dumper/Dumper.xs b/dist/Data-Dumper/Dumper.xs index 59458b948e7e..7c49c9e9cbe6 100644 --- a/dist/Data-Dumper/Dumper.xs +++ b/dist/Data-Dumper/Dumper.xs @@ -301,9 +301,7 @@ esc_q_utf8(pTHX_ SV* sv, const char *src, STRLEN slen, I32 do_utf8, I32 useqq) } else /* The other low ordinals are output as an octal escape * sequence */ - if (s + 1 >= send || ( *(U8*)(s+1) >= '0' - && *(U8*)(s+1) <= '9')) - { + if (s + 1 >= send || isDIGIT(*(s+1))) { /* When the following character is a digit, use 3 octal digits * plus backslash, as using fewer digits would concatenate the * following char into this one */ @@ -393,9 +391,7 @@ esc_q_utf8(pTHX_ SV* sv, const char *src, STRLEN slen, I32 do_utf8, I32 useqq) * since we only encode characters \377 and under, or * \x177 and under for a unicode string */ - next_is_digit = (s + 1 >= send ) - ? FALSE - : (*(U8*)(s+1) >= '0' && *(U8*)(s+1) <= '9'); + next_is_digit = (s + 1 < send && isDIGIT(*(s+1))); /* faster than * r = r + my_sprintf(r, "%o", k); From 5334c4b8ece84ec3e25bdf9069b70db38e0f88a1 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 25 May 2021 11:52:14 -0600 Subject: [PATCH 3/5] Dumper.xs: Revise calculation of needed size This changes the calculation of how large a buffer is needed to use a loop instead of a chain of conditionals. I think it is easier to read, certainly it is tidier, and has the added (small) benefit that it doesn't give up and choose the worst case scenario for large code points. --- dist/Data-Dumper/Dumper.xs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/dist/Data-Dumper/Dumper.xs b/dist/Data-Dumper/Dumper.xs index 7c49c9e9cbe6..b5d7e39fc662 100644 --- a/dist/Data-Dumper/Dumper.xs +++ b/dist/Data-Dumper/Dumper.xs @@ -272,13 +272,11 @@ esc_q_utf8(pTHX_ SV* sv, const char *src, STRLEN slen, I32 do_utf8, I32 useqq) * first byte */ increment = (k == 0 && *s != '\0') ? 1 : UTF8SKIP(s); - grow += 4 + (k <= 0xFF ? 2 : k <= 0xFFF ? 3 : k <= 0xFFFF ? 4 : -#if UVSIZE == 4 - 8 /* We may allocate a bit more than the minimum here. */ -#else - k <= 0xFFFFFFFF ? 8 : UVSIZE * 4 -#endif - ); + grow += 6; /* Smallest we do is "\x{FF}" */ + k >>= 4; + while ((k >>= 4) != 0) { /* Add space for each nibble */ + grow++; + } } else if (useqq) { /* Not utf8, must be <= 0xFF, hence 2 hex * digits. */ From 030107e2f3fa9d534886ec9a53526071cc8a01bf Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 25 May 2021 12:08:06 -0600 Subject: [PATCH 4/5] Dumper.xs: Output orphaned EBCDIC control as octal This makes the code simpler, and removes the need to worry about and comment on EBCDIC. On ASCII machines there are the C0 controls, the C1 controls, and DEL, which isn't technically in either set. The C0 and DEL controls are treated as low ordinal, and output using octal notation. This commit has no behavior changes on ASCII platforms. On EBCDIC machines, there are 1-1 mappings to the entire set of 65 ASCII controls. All but one are in a single block and have been output using octal. This commit doesn't change the behavior of the 64 single-block controls. There is a lone control that isn't adjacent to the others, orphaned. This commit's only effect is to cause it to be displayed using octal instead of hex. I believe the simplification of the code warrants this change. On extant EBCDIC platforms that Perl supports, this control is 0xFF, named EO or EIGHT ONES, and is somewhat like DEL on ASCII platforms, which we already display as octal, even though it is much higher ordinal than any other control displayed as octal. --- dist/Data-Dumper/Dumper.xs | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/dist/Data-Dumper/Dumper.xs b/dist/Data-Dumper/Dumper.xs index b5d7e39fc662..0c1b98773e3e 100644 --- a/dist/Data-Dumper/Dumper.xs +++ b/dist/Data-Dumper/Dumper.xs @@ -254,13 +254,10 @@ esc_q_utf8(pTHX_ SV* sv, const char *src, STRLEN slen, I32 do_utf8, I32 useqq) normal++; } } - else if (! isASCII(k) && k > ' ') { - /* High ordinal non-printable code point. (The test that k is - * above SPACE should be optimized out by the compiler on - * non-EBCDIC platforms; otherwise we could put an #ifdef around - * it, but it's better to have just a single code path when - * possible. All but one of the non-ASCII EBCDIC controls are low - * ordinal; that one is the only one above SPACE.) + else if (! UTF8_IS_INVARIANT(k)) { + /* We treat as low ordinal any code point whose representation is + * the same under UTF-8 as not. Thus, this is a high ordinal code + * point. * * If UTF-8, output as hex, regardless of useqq. This means there * is an overhead of 4 chars '\x{}'. Then count the number of hex @@ -329,18 +326,10 @@ esc_q_utf8(pTHX_ SV* sv, const char *src, STRLEN slen, I32 do_utf8, I32 useqq) U8 c0 = *(U8 *)s; UV k; - if (do_utf8 - && ! isASCII(c0) - /* Exclude non-ASCII low ordinal controls. This should be - * optimized out by the compiler on ASCII platforms; if not - * could wrap it in a #ifdef EBCDIC, but better to avoid - * #if's if possible */ - && c0 > ' ' - ) { - - /* When in UTF-8, we output all non-ascii chars as \x{} - * reqardless of useqq, except for the low ordinal controls on - * EBCDIC platforms */ + if (do_utf8 && ! UTF8_IS_INVARIANT(c0)) { + + /* In UTF-8, we output as \x{} all chars that require more than + * a single byte in UTF-8 to represent. */ k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL); /* treat invalid utf8 byte by byte. This loop iteration gets the @@ -602,7 +591,9 @@ dump_regexp(pTHX_ SV *retval, SV *val) k = *p; } - if ((k == '/' && !saw_backslash) || (do_utf8 && ! isASCII(k) && k > ' ')) { + if ((k == '/' && !saw_backslash) || ( do_utf8 + && ! UTF8_IS_INVARIANT(k))) + { STRLEN to_copy = p - (U8 *) rval; if (to_copy) { /* If saw_backslash is true, this will copy the \ for us too. */ From 3b574ea7eb759e1414cf2673697deac99a4f8851 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 25 May 2021 12:28:47 -0600 Subject: [PATCH 5/5] Dumper.xs: Add comment --- dist/Data-Dumper/Dumper.xs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dist/Data-Dumper/Dumper.xs b/dist/Data-Dumper/Dumper.xs index 0c1b98773e3e..6d73beca3027 100644 --- a/dist/Data-Dumper/Dumper.xs +++ b/dist/Data-Dumper/Dumper.xs @@ -605,7 +605,10 @@ dump_regexp(pTHX_ SV *retval, SV *val) } else { /* If there was a \, we have copied it already, so all that is - * left to do here is the \x{...} escaping. */ + * left to do here is the \x{...} escaping. + * + * Since this is a pattern, presumably created by perl, we can + * assume it is well-formed */ k = utf8_to_uvchr_buf(p, rend, NULL); sv_catpvf(retval, "\\x{%" UVxf "}", k); p += UTF8SKIP(p);