From bd20d3babfcec6108daefb3a5d4d439b583e9c94 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 5 May 2020 17:55:58 -0600 Subject: [PATCH 1/2] regexec.c: Refactor macro to generalize it This is in preparation for a somewhat different use to be added. --- regexec.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/regexec.c b/regexec.c index 491888a154fe..3a8fb8395b8c 100644 --- a/regexec.c +++ b/regexec.c @@ -1884,13 +1884,12 @@ STMT_START { previous_occurrence_end = s; \ } -/* This differs from the above macros in that it is passed a single byte that - * is known to begin the next occurrence of the thing being looked for in 's'. - * It does a memchr to find the next occurrence of 'byte', before trying 'COND' - * at that position. */ -#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \ +/* This is like the above macro except that the function 'f' returns NULL if + * there is no occurrence, and there is a further condition, 'COND', that must + * be matched in addition to the function finding an occurrence */ +#define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND) \ while (s < strend) { \ - s = (char *) memchr(s, byte, strend -s); \ + s = (char *) f; \ if (s == NULL) { \ s = (char *) strend; \ break; \ @@ -1906,6 +1905,14 @@ STMT_START { } \ } +/* This differs from the above macros in that it is passed a single byte that + * is known to begin the next occurrence of the thing being looked for in 's'. + * It does a memchr to find the next occurrence of 'byte', before trying 'COND' + * at that position. */ +#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \ + REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s), \ + COND) + /* The four macros below are slightly different versions of the same logic. * * The first is for /a and /aa when the target string is UTF-8. 
This can only From 414d8142bc79098973d41fcb96a8a0ebf67e15a0 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 5 May 2020 18:04:55 -0600 Subject: [PATCH 2/2] regexec.c: Slight speed up of intuit matching For an operation where we know there is an initial substring, the code now looks for that whole substring before returning a candidate position. Previously it looked for just the first byte, and then the condition had to include looking for the remainder. Effectively memmem() is now called, instead of memchr() followed by a comparison of the rest. --- regexec.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/regexec.c b/regexec.c index 3a8fb8395b8c..4c2eba631717 100644 --- a/regexec.c +++ b/regexec.c @@ -1913,6 +1913,13 @@ STMT_START { REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s), \ COND) +/* This is like the macro above, but takes an entire string to look for + * instead of a single byte */ +#define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND) \ + REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND( \ + ninstr(s, strend, substr, substr_end), \ + COND) + /* The four macros below are slightly different versions of the same logic. * * The first is for /a and /aa when the target string is UTF-8. This can only @@ -2318,10 +2325,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case ANYOFHs_t8_pb: case ANYOFHs_t8_p8: - REXEC_FBC_UTF8_CLASS_SCAN( - ( strend -s >= FLAGS(c) - && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c)) - && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */))); + REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN( + ((struct regnode_anyofhs *) c)->string, + ((struct regnode_anyofhs *) c)->string + FLAGS(c), + reginclass(prog, c, (U8*)s, (U8*) strend, + 1 /* is utf8 */)); break; case ANYOFR_tb_pb: