Skip to content

Commit

Permalink
regexec.c: refactor find-by-class EXACTish code
Browse files Browse the repository at this point in the history
This code is badly out of date: it matches using upper and lower case instead of fold-case.
  • Loading branch information
Karl Williamson committed Feb 14, 2011
1 parent a33c29b commit fac1af7
Showing 1 changed file with 112 additions and 14 deletions.
126 changes: 112 additions & 14 deletions regexec.c
Expand Up @@ -1256,8 +1256,8 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
if ( f != c \
&& (f == c1 || f == c2) \
&& (ln == len || \
foldEQ_utf8(s, &my_strend, 0, utf8_target,\
m, NULL, ln, cBOOL(UTF_PATTERN)))\
foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,\
m, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags))\
&& (!reginfo || regtry(reginfo, &s)) ) \
goto got_it; \
} \
Expand All @@ -1266,17 +1266,9 @@ s += len

#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
STMT_START { \
re_fold_t folder; \
switch (OP(c)) { \
case EXACTFU: folder = foldEQ_latin1; break; \
case EXACTFL: folder = foldEQ_locale; break; \
case EXACTF: folder = foldEQ; break; \
default: \
Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \
} \
while (s <= e) { \
if ( (CoNd) \
&& (ln == 1 || folder(s, m, ln)) \
&& (ln == 1 || folder(s, pat_string, ln)) \
&& (!reginfo || regtry(reginfo, &s)) ) \
goto got_it; \
s++; \
Expand Down Expand Up @@ -1447,15 +1439,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
{
dVAR;
const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
char *m;
char *pat_string; /* The pattern's exactish string */
char *pat_end; /* ptr to end char of pat_string */
re_fold_t folder; /* Function for computing non-utf8 folds */
const U8 *fold_array; /* array for folding ords < 256 */
STRLEN ln;
STRLEN lnc;
register STRLEN uskip;
unsigned int c1;
unsigned int c2;
U8 c1;
U8 c2;
char *e;
register I32 tmp = 1; /* Scratch variable? */
register const bool utf8_target = PL_reg_match_utf8;
UV utf8_fold_flags;
RXi_GET_DECL(prog,progi);

PERL_ARGS_ASSERT_FIND_BYCLASS;
Expand Down Expand Up @@ -1498,7 +1494,108 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
);
break;
case EXACTFU:
if (UTF_PATTERN || utf8_target) {
utf8_fold_flags = 0;
goto do_exactf_utf8;
}
fold_array = PL_fold_latin1;
folder = foldEQ_latin1;
/* XXX This uses the full utf8 fold because if the pattern contains
* 'ss' it could match LATIN SMALL LETTER SHARP S in the string.
* There could be a new node type, say EXACTFU_SS, which is
* generated by regcomp only if there is an 'ss', and then every
* other case could goto do_exactf_non_utf8.
* NOTE(review): utf8_fold_flags does not appear to be set on this
* path before do_exactf_utf8 reads it -- confirm. */
goto do_exactf_utf8;

case EXACTF:
if (UTF_PATTERN || utf8_target) {
utf8_fold_flags = 0;
goto do_exactf_utf8;
}
fold_array = PL_fold;
folder = foldEQ;
goto do_exactf_non_utf8;

case EXACTFL:
if (UTF_PATTERN || utf8_target) {
utf8_fold_flags = 0; /* XXX, add new flag for locale */
goto do_exactf_utf8;
}
fold_array = PL_fold_locale;
folder = foldEQ_locale;

/* FALL THROUGH */

do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */

/* The idea in the non-utf8 EXACTF* cases is to first find the
* first character of the EXACTF* node and then, if necessary,
* case-insensitively compare the full text of the node. c1 is the
* first character; c2 is its fold. This logic will not work for
* Unicode semantics and the German sharp s, which hence should
* not be compiled into a node that gets here. */
pat_string = STRING(c);
ln = STR_LEN(c); /* length to match in octets/bytes */

e = HOP3c(strend, -((I32)ln), s);

if (!reginfo && e < s) {
e = s; /* Due to minlen logic of intuit() */
}

c1 = *pat_string;
c2 = fold_array[c1];
if (c1 == c2) { /* If char and fold are the same */
REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
}
else {
REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
}
break;

do_exactf_utf8:

/* If one of the operands is in utf8, we can't use the simpler
* folding above, because many different characters can have the
* same fold, or a portion of a fold, or folds of different
* lengths */
pat_string = STRING(c);
ln = STR_LEN(c); /* length to match in octets/bytes */
pat_end = pat_string + ln;
lnc = (UTF_PATTERN) /* length to match in characters */
? utf8_length((U8 *) pat_string, (U8 *) pat_end)
: ln;

e = HOP3c(strend, -((I32)lnc), s);

if (!reginfo && e < s) {
e = s; /* Due to minlen logic of intuit() */
}

while (s <= e) {
char *my_strend= (char *)strend;
if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
&& (!reginfo || regtry(reginfo, &s)) )
{
goto got_it;
}
s += UTF8SKIP(s);
}
break;


#if 0
case EXACTFA:
utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
goto do_exactf_non_locale;

case EXACTFU:
case EXACTF:
utf8_fold_flags = 0;

do_exactf_non_locale:

m = STRING(c);
ln = STR_LEN(c); /* length to match in octets/bytes */
lnc = (I32) ln; /* length to match in characters */
Expand Down Expand Up @@ -1625,6 +1722,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
}
break;
#endif
case BOUNDL:
PL_reg_flags |= RF_tainted;
FBC_BOUND(isALNUM_LC,
Expand Down

0 comments on commit fac1af7

Please sign in to comment.