From bd20d3babfcec6108daefb3a5d4d439b583e9c94 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 5 May 2020 17:55:58 -0600
Subject: [PATCH 1/2] regexec.c: Refactor macro to generalize it

This prepares the macro for a second, somewhat different, use to be added.
---
 regexec.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/regexec.c b/regexec.c
index 491888a154fe..3a8fb8395b8c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1884,13 +1884,12 @@ STMT_START {
         previous_occurrence_end = s;                        \
     }
 
-/* This differs from the above macros in that it is passed a single byte that
- * is known to begin the next occurrence of the thing being looked for in 's'.
- * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
- * at that position. */
-#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)      \
+/* This is like the above macro except that 'f' is an expression that yields
+ * the next candidate position, or NULL if there is no occurrence, and 'COND'
+ * is a further condition that must be matched besides what 'f' found */
+#define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND)         \
     while (s < strend) {                                    \
-        s = (char *) memchr(s, byte, strend -s);            \
+        s = (char *) f;                                     \
         if (s == NULL) {                                    \
             s = (char *) strend;                            \
             break;                                          \
@@ -1906,6 +1905,14 @@ STMT_START {
         }                                                   \
     }
 
+/* This differs from the above macros in that it is passed a single byte that
+ * is known to begin the next occurrence of the thing being looked for in 's'.
+ * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
+ * at that position. */
+#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)                  \
+    REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s),     \
+                                              COND)
+
 /* The four macros below are slightly different versions of the same logic.
  *
  * The first is for /a and /aa when the target string is UTF-8.  This can only

From 414d8142bc79098973d41fcb96a8a0ebf67e15a0 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 5 May 2020 18:04:55 -0600
Subject: [PATCH 2/2] regexec.c: Slight speed up of intuit matching

For an operation where we know there is an initial substring, the code
now looks for that whole substring before returning a candidate
position.  Previously it looked for just the first byte, and then the
condition had to include looking for the remainder.  Effectively
memmem() is now called, instead of memchr() followed by a comparison of
the rest.
---
 regexec.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/regexec.c b/regexec.c
index 3a8fb8395b8c..4c2eba631717 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1913,6 +1913,13 @@ STMT_START {
     REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s),     \
                                               COND)
 
+/* This is like the macro above, but takes an entire string to look for
+ * instead of a single byte */
+#define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND)      \
+    REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(                                     \
+                                     ninstr(s, strend, substr, substr_end), \
+                                     COND)
+
 /* The four macros below are slightly different versions of the same logic.
  *
  * The first is for /a and /aa when the target string is UTF-8.  This can only
@@ -2318,10 +2325,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 
       case ANYOFHs_t8_pb:
       case ANYOFHs_t8_p8:
-        REXEC_FBC_UTF8_CLASS_SCAN(
-                (   strend -s >= FLAGS(c)
-                && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
-                && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
+        REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
+                        ((struct regnode_anyofhs *) c)->string,
+                        ((struct regnode_anyofhs *) c)->string + FLAGS(c),
+                        reginclass(prog, c, (U8*)s, (U8*) strend,
+                                   1 /* is utf8 */));
         break;
 
       case ANYOFR_tb_pb: