forked from HypothesisWorks/hypothesis
/
regex.py
501 lines (421 loc) · 18.2 KB
/
regex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
import operator
import re
try: # pragma: no cover
import re._constants as sre
import re._parser as sre_parse
except ImportError: # Python < 3.11
import sre_constants as sre # type: ignore
import sre_parse # type: ignore
from hypothesis import reject, strategies as st
from hypothesis.internal.charmap import as_general_categories, categories
from hypothesis.internal.compat import int_to_byte
UNICODE_CATEGORIES = set(categories())
SPACE_CHARS = set(" \t\n\r\f\v")
UNICODE_SPACE_CHARS = SPACE_CHARS | set("\x1c\x1d\x1e\x1f\x85")
UNICODE_DIGIT_CATEGORIES = {"Nd"}
UNICODE_SPACE_CATEGORIES = set(as_general_categories("Z"))
UNICODE_LETTER_CATEGORIES = set(as_general_categories("L"))
UNICODE_WORD_CATEGORIES = set(as_general_categories(["L", "N"]))
# This is verbose, but correct on all versions of Python
BYTES_ALL = {int_to_byte(i) for i in range(256)}
BYTES_DIGIT = {b for b in BYTES_ALL if re.match(b"\\d", b)}
BYTES_SPACE = {b for b in BYTES_ALL if re.match(b"\\s", b)}
BYTES_WORD = {b for b in BYTES_ALL if re.match(b"\\w", b)}
BYTES_LOOKUP = {
sre.CATEGORY_DIGIT: BYTES_DIGIT,
sre.CATEGORY_SPACE: BYTES_SPACE,
sre.CATEGORY_WORD: BYTES_WORD,
sre.CATEGORY_NOT_DIGIT: BYTES_ALL - BYTES_DIGIT,
sre.CATEGORY_NOT_SPACE: BYTES_ALL - BYTES_SPACE,
sre.CATEGORY_NOT_WORD: BYTES_ALL - BYTES_WORD,
}
GROUP_CACHE_STRATEGY = st.shared(st.builds(dict), key="hypothesis.regex.group_cache")
@st.composite
def update_group(draw, group_name, strategy):
cache = draw(GROUP_CACHE_STRATEGY)
result = draw(strategy)
cache[group_name] = result
return result
@st.composite
def reuse_group(draw, group_name):
cache = draw(GROUP_CACHE_STRATEGY)
try:
return cache[group_name]
except KeyError:
reject()
@st.composite
def group_conditional(draw, group_name, if_yes, if_no):
cache = draw(GROUP_CACHE_STRATEGY)
if group_name in cache:
return draw(if_yes)
else:
return draw(if_no)
@st.composite
def clear_cache_after_draw(draw, base_strategy):
cache = draw(GROUP_CACHE_STRATEGY)
result = draw(base_strategy)
cache.clear()
return result
class Context:
__slots__ = ["flags"]
def __init__(self, flags):
self.flags = flags
class CharactersBuilder:
"""Helper object that allows to configure `characters` strategy with
various unicode categories and characters. Also allows negation of
configured set.
:param negate: If True, configure :func:`hypothesis.strategies.characters`
to match anything other than configured character set
:param flags: Regex flags. They affect how and which characters are matched
"""
def __init__(self, negate=False, flags=0):
self._categories = set()
self._whitelist_chars = set()
self._blacklist_chars = set()
self._negate = negate
self._ignorecase = flags & re.IGNORECASE
self._unicode = not bool(flags & re.ASCII)
self.code_to_char = chr
@property
def strategy(self):
"""Returns resulting strategy that generates configured char set."""
max_codepoint = None if self._unicode else 127
# Due to the .swapcase() issue described below (and in issue #2657),
# self._whitelist_chars may contain strings of len > 1. We therefore
# have some extra logic to filter them out of st.characters() args,
# but still generate them if allowed to.
if self._negate:
black_chars = self._blacklist_chars - self._whitelist_chars
return st.characters(
blacklist_categories=self._categories | {"Cc", "Cs"},
blacklist_characters={c for c in self._whitelist_chars if len(c) == 1},
whitelist_characters=black_chars,
max_codepoint=max_codepoint,
)
white_chars = self._whitelist_chars - self._blacklist_chars
multi_chars = {c for c in white_chars if len(c) > 1}
char_strategy = st.characters(
whitelist_categories=self._categories,
blacklist_characters=self._blacklist_chars,
whitelist_characters=white_chars - multi_chars,
max_codepoint=max_codepoint,
)
if multi_chars:
char_strategy |= st.sampled_from(sorted(multi_chars))
return char_strategy
def add_category(self, category):
"""Update unicode state to match sre_parse object ``category``."""
if category == sre.CATEGORY_DIGIT:
self._categories |= UNICODE_DIGIT_CATEGORIES
elif category == sre.CATEGORY_NOT_DIGIT:
self._categories |= UNICODE_CATEGORIES - UNICODE_DIGIT_CATEGORIES
elif category == sre.CATEGORY_SPACE:
self._categories |= UNICODE_SPACE_CATEGORIES
self._whitelist_chars |= (
UNICODE_SPACE_CHARS if self._unicode else SPACE_CHARS
)
elif category == sre.CATEGORY_NOT_SPACE:
self._categories |= UNICODE_CATEGORIES - UNICODE_SPACE_CATEGORIES
self._blacklist_chars |= (
UNICODE_SPACE_CHARS if self._unicode else SPACE_CHARS
)
elif category == sre.CATEGORY_WORD:
self._categories |= UNICODE_WORD_CATEGORIES
self._whitelist_chars.add("_")
elif category == sre.CATEGORY_NOT_WORD:
self._categories |= UNICODE_CATEGORIES - UNICODE_WORD_CATEGORIES
self._blacklist_chars.add("_")
else:
raise NotImplementedError(f"Unknown character category: {category}")
def add_char(self, char):
"""Add given char to the whitelist."""
c = self.code_to_char(char)
self._whitelist_chars.add(c)
if (
self._ignorecase
and re.match(re.escape(c), c.swapcase(), flags=re.IGNORECASE) is not None
):
# Note that it is possible that `len(c.swapcase()) > 1`
self._whitelist_chars.add(c.swapcase())
class BytesBuilder(CharactersBuilder):
def __init__(self, negate=False, flags=0):
self._whitelist_chars = set()
self._blacklist_chars = set()
self._negate = negate
self._ignorecase = flags & re.IGNORECASE
self.code_to_char = int_to_byte
@property
def strategy(self):
"""Returns resulting strategy that generates configured char set."""
allowed = self._whitelist_chars
if self._negate:
allowed = BYTES_ALL - allowed
return st.sampled_from(sorted(allowed))
def add_category(self, category):
"""Update characters state to match sre_parse object ``category``."""
self._whitelist_chars |= BYTES_LOOKUP[category]
@st.composite
def maybe_pad(draw, regex, strategy, left_pad_strategy, right_pad_strategy):
"""Attempt to insert padding around the result of a regex draw while
preserving the match."""
result = draw(strategy)
left_pad = draw(left_pad_strategy)
if left_pad and regex.search(left_pad + result):
result = left_pad + result
right_pad = draw(right_pad_strategy)
if right_pad and regex.search(result + right_pad):
result += right_pad
return result
def base_regex_strategy(regex, parsed=None):
if parsed is None:
parsed = sre_parse.parse(regex.pattern, flags=regex.flags)
return clear_cache_after_draw(
_strategy(parsed, Context(flags=regex.flags), isinstance(regex.pattern, str))
)
def regex_strategy(regex, fullmatch, *, _temp_jsonschema_hack_no_end_newline=False):
if not hasattr(regex, "pattern"):
regex = re.compile(regex)
is_unicode = isinstance(regex.pattern, str)
parsed = sre_parse.parse(regex.pattern, flags=regex.flags)
if fullmatch:
if not parsed:
return st.just("" if is_unicode else b"")
return base_regex_strategy(regex, parsed).filter(regex.fullmatch)
if not parsed:
if is_unicode:
return st.text()
else:
return st.binary()
if is_unicode:
base_padding_strategy = st.text()
empty = st.just("")
newline = st.just("\n")
else:
base_padding_strategy = st.binary()
empty = st.just(b"")
newline = st.just(b"\n")
right_pad = base_padding_strategy
left_pad = base_padding_strategy
if parsed[-1][0] == sre.AT:
if parsed[-1][1] == sre.AT_END_STRING:
right_pad = empty
elif parsed[-1][1] == sre.AT_END:
if regex.flags & re.MULTILINE:
right_pad = st.one_of(
empty, st.builds(operator.add, newline, right_pad)
)
else:
right_pad = st.one_of(empty, newline)
# This will be removed when a regex-syntax-translation library exists.
# It's a pretty nasty hack, but means that we can match the semantics
# of JSONschema's compatible subset of ECMA regex, which is important
# for hypothesis-jsonschema and Schemathesis. See e.g.
# https://github.com/schemathesis/schemathesis/issues/1241
if _temp_jsonschema_hack_no_end_newline:
right_pad = empty
if parsed[0][0] == sre.AT:
if parsed[0][1] == sre.AT_BEGINNING_STRING:
left_pad = empty
elif parsed[0][1] == sre.AT_BEGINNING:
if regex.flags & re.MULTILINE:
left_pad = st.one_of(empty, st.builds(operator.add, left_pad, newline))
else:
left_pad = empty
base = base_regex_strategy(regex, parsed).filter(regex.search)
return maybe_pad(regex, base, left_pad, right_pad)
def _strategy(codes, context, is_unicode):
"""Convert SRE regex parse tree to strategy that generates strings matching
that regex represented by that parse tree.
`codes` is either a list of SRE regex elements representations or a
particular element representation. Each element is a tuple of element code
(as string) and parameters. E.g. regex 'ab[0-9]+' compiles to following
elements:
[
(LITERAL, 97),
(LITERAL, 98),
(MAX_REPEAT, (1, 4294967295, [
(IN, [
(RANGE, (48, 57))
])
]))
]
The function recursively traverses regex element tree and converts each
element to strategy that generates strings that match that element.
Context stores
1. List of groups (for backreferences)
2. Active regex flags (e.g. IGNORECASE, DOTALL, UNICODE, they affect
behavior of various inner strategies)
"""
def recurse(codes):
return _strategy(codes, context, is_unicode)
if is_unicode:
empty = ""
to_char = chr
else:
empty = b""
to_char = int_to_byte
binary_char = st.binary(min_size=1, max_size=1)
if not isinstance(codes, tuple):
# List of codes
strategies = []
i = 0
while i < len(codes):
if codes[i][0] == sre.LITERAL and not context.flags & re.IGNORECASE:
# Merge subsequent "literals" into one `just()` strategy
# that generates corresponding text if no IGNORECASE
j = i + 1
while j < len(codes) and codes[j][0] == sre.LITERAL:
j += 1
if i + 1 < j:
chars = (to_char(charcode) for _, charcode in codes[i:j])
strategies.append(st.just(empty.join(chars)))
i = j
continue
strategies.append(recurse(codes[i]))
i += 1
# We handle this separately at the top level, but some regex can
# contain empty lists internally, so we need to handle this here too.
if not strategies:
return st.just(empty)
if len(strategies) == 1:
return strategies[0]
return st.tuples(*strategies).map(empty.join)
else:
# Single code
code, value = codes
if code == sre.LITERAL:
# Regex 'a' (single char)
c = to_char(value)
if (
context.flags & re.IGNORECASE
and c != c.swapcase()
and re.match(re.escape(c), c.swapcase(), re.IGNORECASE) is not None
):
# We do the explicit check for swapped-case matching because
# eg 'ß'.upper() == 'SS' and ignorecase doesn't match it.
return st.sampled_from([c, c.swapcase()])
return st.just(c)
elif code == sre.NOT_LITERAL:
# Regex '[^a]' (negation of a single char)
c = to_char(value)
blacklist = {c}
if (
context.flags & re.IGNORECASE
and re.match(re.escape(c), c.swapcase(), re.IGNORECASE) is not None
):
# There are a few cases where .swapcase() returns two characters,
# but is still a case-insensitive match. In such cases we add *both*
# characters to our blacklist, to avoid doing the wrong thing for
# patterns such as r"[^\u0130]+" where "i\u0307" matches.
#
# (that's respectively 'Latin letter capital I with dot above' and
# 'latin latter i' + 'combining dot above'; see issue #2657)
#
# As a final additional wrinkle, "latin letter capital I" *also*
# case-insensitive-matches, with or without combining dot character.
# We therefore have to chain .swapcase() calls until a fixpoint.
stack = [c.swapcase()]
while stack:
for char in stack.pop():
blacklist.add(char)
stack.extend(set(char.swapcase()) - blacklist)
if is_unicode:
return st.characters(blacklist_characters=blacklist)
else:
return binary_char.filter(lambda c: c not in blacklist)
elif code == sre.IN:
# Regex '[abc0-9]' (set of characters)
negate = value[0][0] == sre.NEGATE
if is_unicode:
builder = CharactersBuilder(negate, context.flags)
else:
builder = BytesBuilder(negate, context.flags)
for charset_code, charset_value in value:
if charset_code == sre.NEGATE:
# Regex '[^...]' (negation)
# handled by builder = CharactersBuilder(...) above
pass
elif charset_code == sre.LITERAL:
# Regex '[a]' (single char)
builder.add_char(charset_value)
elif charset_code == sre.RANGE:
# Regex '[a-z]' (char range)
low, high = charset_value
for char_code in range(low, high + 1):
builder.add_char(char_code)
elif charset_code == sre.CATEGORY:
# Regex '[\w]' (char category)
builder.add_category(charset_value)
else:
# Currently there are no known code points other than
# handled here. This code is just future proofing
raise NotImplementedError(f"Unknown charset code: {charset_code}")
return builder.strategy
elif code == sre.ANY:
# Regex '.' (any char)
if is_unicode:
if context.flags & re.DOTALL:
return st.characters()
return st.characters(blacklist_characters="\n")
else:
if context.flags & re.DOTALL:
return binary_char
return binary_char.filter(lambda c: c != b"\n")
elif code == sre.AT:
# Regexes like '^...', '...$', '\bfoo', '\Bfoo'
# An empty string (or newline) will match the token itself, but
# we don't and can't check the position (eg '%' at the end)
return st.just(empty)
elif code == sre.SUBPATTERN:
# Various groups: '(...)', '(:...)' or '(?P<name>...)'
old_flags = context.flags
context.flags = (context.flags | value[1]) & ~value[2]
strat = _strategy(value[-1], context, is_unicode)
context.flags = old_flags
if value[0]:
strat = update_group(value[0], strat)
return strat
elif code == sre.GROUPREF:
# Regex '\\1' or '(?P=name)' (group reference)
return reuse_group(value)
elif code == sre.ASSERT:
# Regex '(?=...)' or '(?<=...)' (positive lookahead/lookbehind)
return recurse(value[1])
elif code == sre.ASSERT_NOT:
# Regex '(?!...)' or '(?<!...)' (negative lookahead/lookbehind)
return st.just(empty)
elif code == sre.BRANCH:
# Regex 'a|b|c' (branch)
return st.one_of([recurse(branch) for branch in value[1]])
elif code in [sre.MIN_REPEAT, sre.MAX_REPEAT]:
# Regexes 'a?', 'a*', 'a+' and their non-greedy variants
# (repeaters)
at_least, at_most, subregex = value
if at_most == sre.MAXREPEAT:
at_most = None
if at_least == 0 and at_most == 1:
return st.just(empty) | recurse(subregex)
return st.lists(recurse(subregex), min_size=at_least, max_size=at_most).map(
empty.join
)
elif code == sre.GROUPREF_EXISTS:
# Regex '(?(id/name)yes-pattern|no-pattern)'
# (if group exists choice)
return group_conditional(
value[0],
recurse(value[1]),
recurse(value[2]) if value[2] else st.just(empty),
)
else:
# Currently there are no known code points other than handled here.
# This code is just future proofing
raise NotImplementedError(f"Unknown code point: {code!r}")