From 9eef41fce092c25c852373c68dab340d3b3754e4 Mon Sep 17 00:00:00 2001 From: yaphott Date: Tue, 1 Aug 2023 22:23:29 -0500 Subject: [PATCH 01/16] Underscore instead of hyphen for publish in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8392872..9115bef 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ PYTHON=python3 -APP_NAME=regex-toolkit +APP_NAME=regex_toolkit install: ${PYTHON} -m pip install . From e360de1bdd74fffc8ac2f56d27002d6983f77c50 Mon Sep 17 00:00:00 2001 From: yaphott Date: Fri, 4 Aug 2023 18:40:31 -0500 Subject: [PATCH 02/16] Increment version --- src/regex_toolkit/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/regex_toolkit/__init__.py b/src/regex_toolkit/__init__.py index de90ba6..c25d076 100644 --- a/src/regex_toolkit/__init__.py +++ b/src/regex_toolkit/__init__.py @@ -17,7 +17,7 @@ to_utf8, ) -__version__ = "0.0.4" +__version__ = "0.0.5" __all__ = [ "escape", From 64ee7dbf9ab1d3a0efcc44bdd3345bb9d3c0a605 Mon Sep 17 00:00:00 2001 From: yaphott Date: Thu, 10 Aug 2023 19:25:20 -0500 Subject: [PATCH 03/16] Prefer shorter codepoint values when escaping for RE2 --- src/regex_toolkit/base.py | 2 +- tests/test_base.py | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index ca6a983..84ac1f6 100644 --- a/src/regex_toolkit/base.py +++ b/src/regex_toolkit/base.py @@ -53,7 +53,7 @@ def _escape2(char: str) -> str: return f"\\{char}" else: # Otherwise escape using the codepoint - return "\\x{" + char_to_cpoint(char) + "}" + return "\\x{" + char_to_cpoint(char).removeprefix("0000") + "}" def string_as_exp(text: str, flavor: int = 1) -> str: diff --git a/tests/test_base.py b/tests/test_base.py index afc1259..78295da 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -71,9 +71,26 @@ def test_escapable(self): pattern = self._re_compile(actual_exp) self.assertTrue(pattern.match(char)) + def test_trimmed(self): + expected_exp = "\\x{00b0}" + actual_exp = regex_toolkit.escape("Β°", self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the character. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match("Β°")) + + def test_untrimmed(self): + expected_exp = "\\x{0001f170}" + actual_exp = regex_toolkit.escape("πŸ…°", self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the character. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match("πŸ…°")) + def test_unknown(self): # TODO: Include additional characters to test. - # NOTE: Same as running: "\\x{" + format(ord("πŸŒ„"), "x").zfill(8) + "}" + # TODO: Cover chars that would be trimmed. + # NOTE: Same as running: "\\x{" + format(ord("πŸŒ„"), "x").zfill(8).removeprefix("0000") + "}" for char, expected_exp in ( # Length 1 ("πŸ…°", r"\x{0001f170}"), @@ -237,6 +254,8 @@ def test_escapable_joined_as_one(self): self.assertTrue(pattern.match(text)) def test_unknown_joined_as_one(self): + # TODO: Include additional characters to test. + # TODO: Cover chars that would be trimmed. 
text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" expected_exp = r"".join( ( From 1fee7e629c6c34f89396767712b3883f5eeebef3 Mon Sep 17 00:00:00 2001 From: yaphott Date: Thu, 10 Aug 2023 19:26:40 -0500 Subject: [PATCH 04/16] More uniformity in tests for regex expressions --- tests/test_base.py | 238 ++++++++++++++++++++++----------------------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/tests/test_base.py b/tests/test_base.py index 78295da..a25b6b4 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -93,46 +93,46 @@ def test_unknown(self): # NOTE: Same as running: "\\x{" + format(ord("πŸŒ„"), "x").zfill(8).removeprefix("0000") + "}" for char, expected_exp in ( # Length 1 - ("πŸ…°", r"\x{0001f170}"), - ("πŸ…±", r"\x{0001f171}"), - ("πŸ…Ύ", r"\x{0001f17e}"), - ("πŸ…Ώ", r"\x{0001f17f}"), - ("πŸ†Ž", r"\x{0001f18e}"), - ("πŸ†‘", r"\x{0001f191}"), - ("πŸ†’", r"\x{0001f192}"), - ("πŸ†“", r"\x{0001f193}"), - ("πŸ†”", r"\x{0001f194}"), - ("πŸ†•", r"\x{0001f195}"), - ("πŸ†–", r"\x{0001f196}"), - ("πŸ†—", r"\x{0001f197}"), - ("πŸ†˜", r"\x{0001f198}"), - ("πŸ†™", r"\x{0001f199}"), - ("πŸ†š", r"\x{0001f19a}"), - ("πŸ‡¦", r"\x{0001f1e6}"), - ("πŸ‡§", r"\x{0001f1e7}"), - ("πŸ‡¨", r"\x{0001f1e8}"), - ("🈁", r"\x{0001f201}"), - ("πŸˆ‚", r"\x{0001f202}"), - ("🈚", r"\x{0001f21a}"), - ("🈯", r"\x{0001f22f}"), - ("🈲", r"\x{0001f232}"), - ("🈳", r"\x{0001f233}"), - ("🈴", r"\x{0001f234}"), - ("🈡", r"\x{0001f235}"), - ("🈢", r"\x{0001f236}"), - ("🈷", r"\x{0001f237}"), - ("🈸", r"\x{0001f238}"), - ("🈹", r"\x{0001f239}"), - ("🈺", r"\x{0001f23a}"), - ("πŸ‰", r"\x{0001f250}"), - ("πŸ‰‘", r"\x{0001f251}"), - ("πŸŒ€", r"\x{0001f300}"), - ("🌁", r"\x{0001f301}"), - ("πŸŒ‚", r"\x{0001f302}"), - ("πŸŒƒ", r"\x{0001f303}"), - ("πŸŒ„", r"\x{0001f304}"), + ("πŸ…°", "\\x{0001f170}"), + ("πŸ…±", "\\x{0001f171}"), + ("πŸ…Ύ", "\\x{0001f17e}"), + ("πŸ…Ώ", "\\x{0001f17f}"), + ("πŸ†Ž", "\\x{0001f18e}"), + ("πŸ†‘", "\\x{0001f191}"), + ("πŸ†’", "\\x{0001f192}"), + ("πŸ†“", "\\x{0001f193}"), + ("πŸ†”", "\\x{0001f194}"), + ("πŸ†•", "\\x{0001f195}"), + ("πŸ†–", "\\x{0001f196}"), + ("πŸ†—", "\\x{0001f197}"), + ("πŸ†˜", "\\x{0001f198}"), + ("πŸ†™", "\\x{0001f199}"), + ("πŸ†š", "\\x{0001f19a}"), + ("πŸ‡¦", "\\x{0001f1e6}"), + ("πŸ‡§", "\\x{0001f1e7}"), + ("πŸ‡¨", "\\x{0001f1e8}"), + ("🈁", "\\x{0001f201}"), + ("πŸˆ‚", "\\x{0001f202}"), + ("🈚", "\\x{0001f21a}"), + ("🈯", "\\x{0001f22f}"), + ("🈲", "\\x{0001f232}"), + ("🈳", "\\x{0001f233}"), + ("🈴", "\\x{0001f234}"), + ("🈡", "\\x{0001f235}"), + ("🈢", "\\x{0001f236}"), + ("🈷", "\\x{0001f237}"), + ("🈸", "\\x{0001f238}"), + ("🈹", "\\x{0001f239}"), + ("🈺", "\\x{0001f23a}"), + ("πŸ‰", "\\x{0001f250}"), + ("πŸ‰‘", "\\x{0001f251}"), + ("πŸŒ€", "\\x{0001f300}"), + ("🌁", "\\x{0001f301}"), + ("πŸŒ‚", "\\x{0001f302}"), + ("πŸŒƒ", "\\x{0001f303}"), + ("πŸŒ„", "\\x{0001f304}"), # Length 2 - ("πŸŒ…", r"\x{0001f305}"), + ("πŸŒ…", "\\x{0001f305}"), ): with self.subTest(char=char): actual_exp = regex_toolkit.escape(char, self._flavor) @@ -259,46 +259,46 @@ def test_unknown_joined_as_one(self): text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" expected_exp = r"".join( ( - r"\x{0001f170}", - r"\x{0001f171}", - r"\x{0001f17e}", - r"\x{0001f17f}", - r"\x{0001f18e}", - r"\x{0001f191}", - r"\x{0001f192}", - r"\x{0001f193}", - r"\x{0001f194}", - r"\x{0001f195}", - 
r"\x{0001f196}", - r"\x{0001f197}", - r"\x{0001f198}", - r"\x{0001f199}", - r"\x{0001f19a}", - r"\x{0001f1e6}", - r"\x{0001f1e7}", - r"\x{0001f1e8}", - r"\x{0001f201}", - r"\x{0001f202}", - r"\x{0001f21a}", - r"\x{0001f22f}", - r"\x{0001f232}", - r"\x{0001f233}", - r"\x{0001f234}", - r"\x{0001f235}", - r"\x{0001f236}", - r"\x{0001f237}", - r"\x{0001f238}", - r"\x{0001f239}", - r"\x{0001f23a}", - r"\x{0001f250}", - r"\x{0001f251}", - r"\x{0001f300}", - r"\x{0001f301}", - r"\x{0001f302}", - r"\x{0001f303}", - r"\x{0001f304}", + "\\x{0001f170}", + "\\x{0001f171}", + "\\x{0001f17e}", + "\\x{0001f17f}", + "\\x{0001f18e}", + "\\x{0001f191}", + "\\x{0001f192}", + "\\x{0001f193}", + "\\x{0001f194}", + "\\x{0001f195}", + "\\x{0001f196}", + "\\x{0001f197}", + "\\x{0001f198}", + "\\x{0001f199}", + "\\x{0001f19a}", + "\\x{0001f1e6}", + "\\x{0001f1e7}", + "\\x{0001f1e8}", + "\\x{0001f201}", + "\\x{0001f202}", + "\\x{0001f21a}", + "\\x{0001f22f}", + "\\x{0001f232}", + "\\x{0001f233}", + "\\x{0001f234}", + "\\x{0001f235}", + "\\x{0001f236}", + "\\x{0001f237}", + "\\x{0001f238}", + "\\x{0001f239}", + "\\x{0001f23a}", + "\\x{0001f250}", + "\\x{0001f251}", + "\\x{0001f300}", + "\\x{0001f301}", + "\\x{0001f302}", + "\\x{0001f303}", + "\\x{0001f304}", # Length 2 - r"\x{0001f305}", + "\\x{0001f305}", ) ) actual_exp = regex_toolkit.string_as_exp(text, self._flavor) @@ -309,7 +309,7 @@ def test_unknown_joined_as_one(self): RESERVED_EXPRESSIONS = frozenset( - {r"\A", r"\b", r"\B", r"\d", r"\D", r"\s", r"\S", r"\w", r"\W", r"\Z", r"\1"} + {"\\A", "\\b", "\\B", "\\d", "\\D", "\\s", "\\S", "\\w", "\\W", "\\Z", "\\1"} ) @@ -473,46 +473,46 @@ def test_unsafe_of_variable_lengths(self): # Unique combinations of ALWAYS_SAFE using various lengths. elements_map = { # Length 1 - "πŸ…°": r"\x{0001f170}", - "πŸ…±": r"\x{0001f171}", - "πŸ…Ύ": r"\x{0001f17e}", - "πŸ…Ώ": r"\x{0001f17f}", - "πŸ†Ž": r"\x{0001f18e}", - "πŸ†‘": r"\x{0001f191}", - "πŸ†’": r"\x{0001f192}", - "πŸ†“": r"\x{0001f193}", - "πŸ†”": r"\x{0001f194}", - "πŸ†•": r"\x{0001f195}", - "πŸ†–": r"\x{0001f196}", - "πŸ†—": r"\x{0001f197}", - "πŸ†˜": r"\x{0001f198}", - "πŸ†™": r"\x{0001f199}", - "πŸ†š": r"\x{0001f19a}", - "πŸ‡¦": r"\x{0001f1e6}", - "πŸ‡§": r"\x{0001f1e7}", - "πŸ‡¨": r"\x{0001f1e8}", - "🈁": r"\x{0001f201}", - "πŸˆ‚": r"\x{0001f202}", - "🈚": r"\x{0001f21a}", - "🈯": r"\x{0001f22f}", - "🈲": r"\x{0001f232}", - "🈳": r"\x{0001f233}", - "🈴": r"\x{0001f234}", - "🈡": r"\x{0001f235}", - "🈢": r"\x{0001f236}", - "🈷": r"\x{0001f237}", - "🈸": r"\x{0001f238}", - "🈹": r"\x{0001f239}", - "🈺": r"\x{0001f23a}", - "πŸ‰": r"\x{0001f250}", - "πŸ‰‘": r"\x{0001f251}", - "πŸŒ€": r"\x{0001f300}", - "🌁": r"\x{0001f301}", - "πŸŒ‚": r"\x{0001f302}", - "πŸŒƒ": r"\x{0001f303}", - "πŸŒ„": r"\x{0001f304}", + "πŸ…°": "\\x{0001f170}", + "πŸ…±": "\\x{0001f171}", + "πŸ…Ύ": "\\x{0001f17e}", + "πŸ…Ώ": "\\x{0001f17f}", + "πŸ†Ž": "\\x{0001f18e}", + "πŸ†‘": "\\x{0001f191}", + "πŸ†’": "\\x{0001f192}", + "πŸ†“": "\\x{0001f193}", + "πŸ†”": "\\x{0001f194}", + "πŸ†•": "\\x{0001f195}", + "πŸ†–": "\\x{0001f196}", + "πŸ†—": "\\x{0001f197}", + "πŸ†˜": "\\x{0001f198}", + "πŸ†™": "\\x{0001f199}", + "πŸ†š": "\\x{0001f19a}", + "πŸ‡¦": "\\x{0001f1e6}", + "πŸ‡§": "\\x{0001f1e7}", + "πŸ‡¨": "\\x{0001f1e8}", + "🈁": "\\x{0001f201}", + "πŸˆ‚": "\\x{0001f202}", + "🈚": "\\x{0001f21a}", + "🈯": "\\x{0001f22f}", + "🈲": "\\x{0001f232}", + "🈳": "\\x{0001f233}", + "🈴": "\\x{0001f234}", + "🈡": "\\x{0001f235}", + "🈢": "\\x{0001f236}", + "🈷": "\\x{0001f237}", + "🈸": "\\x{0001f238}", + "🈹": "\\x{0001f239}", + "🈺": 
"\\x{0001f23a}", + "πŸ‰": "\\x{0001f250}", + "πŸ‰‘": "\\x{0001f251}", + "πŸŒ€": "\\x{0001f300}", + "🌁": "\\x{0001f301}", + "πŸŒ‚": "\\x{0001f302}", + "πŸŒƒ": "\\x{0001f303}", + "πŸŒ„": "\\x{0001f304}", # Length 2 - "πŸŒ…": r"\x{0001f305}", + "πŸŒ…": "\\x{0001f305}", } elements = tuple(elements_map) for texts in product(elements, repeat=self._max_combo_length): @@ -529,7 +529,7 @@ def test_unsafe_of_variable_lengths(self): ##############################3 # Exact matches that equate to reserved spaces ##############################3 # E.g. Should match '\\' + 'n', not r'\n' -##############################3 for text in (r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"): +##############################3 for text in ("\\w", "\\W", "\\d", "\\D", "\\s", "\\S", "\\1"): ##############################3 texts = [text] ##############################3 with self.subTest(texts=texts): ##############################3 self.assertEqual( From a83878a5ee66c6698244a48f6b601c4a3639f395 Mon Sep 17 00:00:00 2001 From: yaphott Date: Thu, 10 Aug 2023 20:38:06 -0500 Subject: [PATCH 05/16] Parameters actually accept string chars when iterating a character range --- README.md | 22 +++++++++++----------- src/regex_toolkit/utils.py | 24 ++++++++++++------------ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 1be1bae..cc191d2 100644 --- a/README.md +++ b/README.md @@ -214,39 +214,39 @@ Form C favors the use of a fully combined character. #### `iter_char_range` ```python -def iter_char_range(first_cpoint: int, - last_cpoint: int) -> Generator[str, None, None] +def iter_char_range(first_char: str, + last_char: str) -> Generator[str, None, None] ``` -Iterate all characters within a range of codepoints (inclusive). +Iterate all characters within a range of characters (inclusive). **Arguments**: -- `first_cpoint` _int_ - Starting (first) codepoint. -- `last_cpoint` _int_ - Ending (last) codepoint. +- `first_char` _str_ - Starting (first) character. +- `last_char` _str_ - Ending (last) character. **Yields**: -- _str_ - Characters within a range of codepoints. +- _str_ - Characters within a range of characters. #### `char_range` ```python -def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...] +def char_range(first_char: str, last_char: str) -> tuple[str, ...] ``` -Tuple of all characters within a range of codepoints (inclusive). +Tuple of all characters within a range of characters (inclusive). **Arguments**: -- `first_cpoint` _int_ - Starting (first) codepoint. -- `last_cpoint` _int_ - Ending (last) codepoint. +- `first_char` _str_ - Starting (first) character. +- `last_char` _str_ - Ending (last) character. **Returns**: -- _tuple[str, ...]_ - Characters within a range of codepoints. +- _tuple[str, ...]_ - Characters within a range of characters. diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py index 41c9df3..9b55dfa 100644 --- a/src/regex_toolkit/utils.py +++ b/src/regex_toolkit/utils.py @@ -121,31 +121,31 @@ def to_nfc(text: str) -> str: return unicodedata.normalize("NFC", text) -def iter_char_range(first_cpoint: int, last_cpoint: int) -> Generator[str, None, None]: - """Iterate all characters within a range of codepoints (inclusive). +def iter_char_range(first_char: str, last_char: str) -> Generator[str, None, None]: + """Iterate all characters within a range of characters (inclusive). Args: - first_cpoint (int): Starting (first) codepoint. - last_cpoint (int): Ending (last) codepoint. + first_char (str): Starting (first) character. 
+ last_char (str): Ending (last) character. Yields: - str: Characters within a range of codepoints. + str: Characters within a range of characters. """ - for i in range(ord(first_cpoint), ord(last_cpoint) + 1): + for i in range(ord(first_char), ord(last_char) + 1): yield chr(i) -def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...]: - """Tuple of all characters within a range of codepoints (inclusive). +def char_range(first_char: str, last_char: str) -> tuple[str, ...]: + """Tuple of all characters within a range of characters (inclusive). Args: - first_cpoint (int): Starting (first) codepoint. - last_cpoint (int): Ending (last) codepoint. + first_char (str): Starting (first) character. + last_char (str): Ending (last) character. Returns: - tuple[str, ...]: Characters within a range of codepoints. + tuple[str, ...]: Characters within a range of characters. """ - return tuple(iter_char_range(first_cpoint, last_cpoint)) + return tuple(iter_char_range(first_char, last_char)) def mask_span( From 958414344077c8ee9ebce68a54f4a87eab3aabd8 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sun, 13 Aug 2023 22:56:58 -0500 Subject: [PATCH 06/16] Add regex making function --- README.md | 53 +++++++++- src/regex_toolkit/__init__.py | 3 + src/regex_toolkit/base.py | 176 +++++++++++++++++++++++----------- src/regex_toolkit/utils.py | 25 ++++- tests/test_base.py | 28 ++++++ tests/test_utils.py | 13 +++ 6 files changed, 240 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index cc191d2..4038dfa 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,25 @@ import regex_toolkit # `regex_toolkit.utils` + + +#### `validate_regex_flavor` + +```python +@lru_cache(maxsize=2) +def validate_regex_flavor(flavor: int) -> None | NoReturn +``` + +Validate a regex flavor. + +**Arguments**: + +- `flavor` _int_ - Regex flavor (1 for RE, 2 for RE2). + +**Raises**: + +- `ValueError` - Invalid regex flavor. + #### `iter_sort_by_len` @@ -134,8 +153,8 @@ The codepoint is always 8 characters long (zero-padded). **Example**: ```python -# Output: '00000061' ord_to_cpoint(97) +# Output: '00000061' ``` **Arguments**: @@ -177,8 +196,8 @@ Character to character codepoint. **Example**: ```python -# Output: '00000061' char_to_cpoint("a") +# Output: '00000061' ``` **Arguments**: @@ -367,6 +386,36 @@ Create a regex expression that exactly matches any one string. - `ValueError` - Invalid regex flavor. + + +#### `make_exp` + +```python +def make_exp(chars: Iterable[str], flavor: int = 1) -> str +``` + +Create a regex expression that exactly matches a list of characters. + +**Example**: + +```python +exp = "[" + make_exp(["a", "b", "c", "z", "y", "x"]) + "]" +# Output: '[a-cx-z]' +``` + +**Arguments**: + +- `chars` _Iterable[str]_ - Characters to match. +- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. + +**Returns**: + +- _str_ - Expression that exactly matches the original characters. + +**Raises**: + +- `ValueError` - Invalid regex flavor. 
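For a quick illustration (a minimal sketch, assuming `make_exp` is exported from the package top level as this patch does, and that the RE2 flavor writes non-ASCII range endpoints as codepoint escapes per the `_escape2` logic in this series):

```python
import regex_toolkit

# Consecutive codepoints collapse into ranges; stray characters stay individual.
regex_toolkit.make_exp(["a", "b", "c", "z", "y", "x"])
# Output: 'a-cx-z'

# RE2 flavor (2): non-ASCII range endpoints become codepoint escapes.
regex_toolkit.make_exp(["πŸ…°", "πŸ…±", "πŸ…²", "πŸ…³"], flavor=2)
# Output: '\\x{0001f170}-\\x{0001f173}'
```

As in the example above, the result is typically wrapped in `[...]` to form a character class.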
+ # `regex_toolkit.enums` diff --git a/src/regex_toolkit/__init__.py b/src/regex_toolkit/__init__.py index c25d076..7976583 100644 --- a/src/regex_toolkit/__init__.py +++ b/src/regex_toolkit/__init__.py @@ -1,5 +1,6 @@ from .base import ( escape, + make_exp, string_as_exp, strings_as_exp, ) @@ -15,6 +16,7 @@ sort_by_len, to_nfc, to_utf8, + validate_regex_flavor, ) __version__ = "0.0.5" @@ -34,4 +36,5 @@ "strings_as_exp", "to_nfc", "to_utf8", + "validate_regex_flavor", ] diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index 84ac1f6..a89db85 100644 --- a/src/regex_toolkit/base.py +++ b/src/regex_toolkit/base.py @@ -2,37 +2,18 @@ "escape", "string_as_exp", "strings_as_exp", + "make_exp", ] -from collections.abc import Iterable +from collections.abc import Callable, Iterable +from typing import Final from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE from regex_toolkit.enums import RegexFlavor -from regex_toolkit.utils import char_to_cpoint, iter_sort_by_len - - -def escape(char: str, flavor: int = 1) -> str: - """Create a regex expression that exactly matches a character. - - Args: - char (str): Character to match. - flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. - - Returns: - str: Expression that exactly matches the original character. - - Raises: - ValueError: Invalid regex flavor. - """ - try: - flavor = RegexFlavor(flavor) - except ValueError: - raise ValueError(f"Invalid regex flavor: {flavor}") - - if flavor == RegexFlavor.RE: - return _escape(char) - # elif flavor == RegexFlavor.RE2: - else: - return _escape2(char) +from regex_toolkit.utils import ( + char_to_cpoint, + iter_sort_by_len, + validate_regex_flavor, +) def _escape(char: str) -> str: @@ -56,29 +37,27 @@ def _escape2(char: str) -> str: return "\\x{" + char_to_cpoint(char).removeprefix("0000") + "}" -def string_as_exp(text: str, flavor: int = 1) -> str: - """Create a regex expression that exactly matches a string. +_ESCAPE_FUNC_MAP: Final[dict[int, Callable]] = { + RegexFlavor.RE: _escape, + RegexFlavor.RE2: _escape2, +} + + +def escape(char: str, flavor: int = 1) -> str: + """Create a regex expression that exactly matches a character. Args: - text (str): String to match. + char (str): Character to match. flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. Returns: - str: Expression that exactly matches the original string. + str: Expression that exactly matches the original character. Raises: ValueError: Invalid regex flavor. """ - try: - flavor = RegexFlavor(flavor) - except ValueError: - raise ValueError(f"Invalid regex flavor: {flavor}") - - if flavor == RegexFlavor.RE: - return _string_as_exp(text) - # elif flavor == RegexFlavor.RE2: - else: - return _string_as_exp2(text) + validate_regex_flavor(flavor) + return _ESCAPE_FUNC_MAP[flavor](char) def _string_as_exp(text: str) -> str: @@ -89,6 +68,43 @@ def _string_as_exp2(text: str) -> str: return r"".join(map(_escape2, text)) +_STRING_AS_EXP_FUNC_MAP: Final[dict[int, Callable]] = { + RegexFlavor.RE: _string_as_exp, + RegexFlavor.RE2: _string_as_exp2, +} + + +def string_as_exp(text: str, flavor: int = 1) -> str: + """Create a regex expression that exactly matches a string. + + Args: + text (str): String to match. + flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. + + Returns: + str: Expression that exactly matches the original string. + + Raises: + ValueError: Invalid regex flavor. 
+ """ + validate_regex_flavor(flavor) + return _STRING_AS_EXP_FUNC_MAP[flavor](text) + + +def _strings_as_exp(texts: Iterable[str]) -> str: + return r"|".join(map(_string_as_exp, iter_sort_by_len(texts, reverse=True))) + + +def _strings_as_exp2(texts: Iterable[str]) -> str: + return r"|".join(map(_string_as_exp2, iter_sort_by_len(texts, reverse=True))) + + +_STRINGS_AS_EXP_FUNC_MAP: Final[dict[int, Callable]] = { + RegexFlavor.RE: _strings_as_exp, + RegexFlavor.RE2: _strings_as_exp2, +} + + def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str: """Create a regex expression that exactly matches any one string. @@ -102,21 +118,73 @@ def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str: Raises: ValueError: Invalid regex flavor. """ - try: - flavor = RegexFlavor(flavor) - except ValueError: - raise ValueError(f"Invalid regex flavor: {flavor}") - - if flavor == RegexFlavor.RE: - return _strings_as_exp(texts) - # elif flavor == RegexFlavor.RE2: + validate_regex_flavor(flavor) + return _STRINGS_AS_EXP_FUNC_MAP[flavor](texts) + + +def _make_group_exp(group: list[int]) -> str: + if len(group) > 2: + # Represent as a character range + print(f"{group = }") + return _escape(chr(group[0])) + "-" + _escape(chr(group[-1])) else: - return _strings_as_exp2(texts) + # Represent as individual characters + print(f"{group = }") + return "".join((_escape(chr(char_ord)) for char_ord in group)) -def _strings_as_exp(texts: Iterable[str]) -> str: - return r"|".join(map(_string_as_exp, iter_sort_by_len(texts, reverse=True))) +def _make_group_exp2(group: list[int]) -> str: + if len(group) > 2: + # Represent as a character range + return _escape2(chr(group[0])) + "-" + _escape2(chr(group[-1])) + else: + # Represent as individual characters + return "".join((_escape2(chr(char_ord)) for char_ord in group)) -def _strings_as_exp2(texts: Iterable[str]) -> str: - return r"|".join(map(_string_as_exp2, iter_sort_by_len(texts, reverse=True))) +_MAKE_GROUP_EXP_FUNC_MAP: Final[dict[int, Callable]] = { + RegexFlavor.RE: _make_group_exp, + RegexFlavor.RE2: _make_group_exp2, +} + + +def make_exp(chars: Iterable[str], flavor: int = 1) -> str: + """Create a regex expression that exactly matches a list of characters. + + Example: + + ```python + exp = "[" + make_exp(["a", "b", "c", "z", "y", "x"]) + "]" + # Output: '[a-cx-z]' + ``` + + Args: + chars (Iterable[str]): Characters to match. + flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. + + Returns: + str: Expression that exactly matches the original characters. + + Raises: + ValueError: Invalid regex flavor. 
+ """ + validate_regex_flavor(flavor) + func = _MAKE_GROUP_EXP_FUNC_MAP[flavor] + + exp = "" + group = [] + for char_ord in sorted(set(map(ord, chars))): + if not group: + # Start first group + group.append(char_ord) + elif char_ord == group[-1] + 1: + # Add to current group + group.append(char_ord) + else: + # Make the group and start a new one + exp += func(group) + group = [char_ord] + if group: + # Make any remaining group + exp += func(group) + return exp diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py index 9b55dfa..d0c27ad 100644 --- a/src/regex_toolkit/utils.py +++ b/src/regex_toolkit/utils.py @@ -1,7 +1,12 @@ import unicodedata from collections.abc import Generator, Iterable +from functools import lru_cache +from typing import NoReturn + +from regex_toolkit.enums import RegexFlavor __all__ = [ + "validate_regex_flavor", "iter_sort_by_len", "sort_by_len", "ord_to_cpoint", @@ -16,6 +21,22 @@ ] +@lru_cache(maxsize=2) +def validate_regex_flavor(flavor: int) -> None | NoReturn: + """Validate a regex flavor. + + Args: + flavor (int): Regex flavor (1 for RE, 2 for RE2). + + Raises: + ValueError: Invalid regex flavor. + """ + try: + flavor = RegexFlavor(flavor) + except ValueError: + raise ValueError(f"Invalid regex flavor: {flavor}") + + def iter_sort_by_len( texts: Iterable[str], *, @@ -59,8 +80,8 @@ def ord_to_cpoint(ordinal: int) -> str: Example: ```python - # Output: '00000061' ord_to_cpoint(97) + # Output: '00000061' ``` Args: @@ -90,8 +111,8 @@ def char_to_cpoint(char: str) -> str: Example: ```python - # Output: '00000061' char_to_cpoint("a") + # Output: '00000061' ``` Args: diff --git a/tests/test_base.py b/tests/test_base.py index a25b6b4..1e71d9e 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -2,6 +2,7 @@ import unittest from itertools import product +import pytest import re2 import regex_toolkit @@ -572,3 +573,30 @@ def test_unsafe_of_variable_lengths(self): ##############################3 ) # TODO: Add tests for actually compiling the e. 
+ + +@pytest.mark.parametrize( + "chars, expected", + ( + # 1 char does not make a range + (["a"], "a"), + # 2 chars should not make a range + (["a", "b"], "ab"), + # 3+ sequential chars make a range + (["a", "b", "c"], "a-c"), + # 3+ non-sequential chars should not make a range + (["a", "c", "e"], "ace"), + # 3+ sequential chars with extra out of range char + (["a", "b", "c", "z"], "a-cz"), + # Chars should always be ordered by ordinal + (["b", "a"], "ab"), + # Chars should always be ordered by ordinal + (["e", "c", "a"], "ace"), + # Chars should always be ordered by ordinal + (["z", "c", "b", "a"], "a-cz"), + # Duplicates should be removed + (["d", "a", "b", "c", "a"], "a-d"), + ), +) +def test_make_exp(chars, expected): + assert regex_toolkit.make_exp(chars) == expected diff --git a/tests/test_utils.py b/tests/test_utils.py index c1fccdf..aceaf9b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,9 +1,22 @@ import unittest from collections.abc import Generator, Iterable +import pytest + import regex_toolkit +@pytest.mark.parametrize("flavor", (1, 2)) +def test_validate_regex_flavor(flavor): + regex_toolkit.validate_regex_flavor(flavor) + + +@pytest.mark.parametrize("flavor", (0, 3)) +def test_validate_regex_flavor_invalid(flavor): + with pytest.raises(ValueError, match=r"^Invalid regex flavor: \d+$"): + regex_toolkit.validate_regex_flavor(flavor) + + def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool: prev_len = None for text in texts: From ea95fbbfa5e60e6957c89076643e8ed01ecda814 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sat, 19 Aug 2023 13:40:36 -0500 Subject: [PATCH 07/16] Formatting change to bash commands in readme to make it easier to copy --- README.md | 8 ++++---- docs/templates/install.md.jinja | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 4038dfa..e09f927 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Most stable version from [**PyPi**](https://pypi.org/project/regex-toolkit/): [![PyPI - License](https://img.shields.io/pypi/l/regex-toolkit?style=flat-square)](https://pypi.org/project/regex-toolkit/) ```bash -$ python3 -m pip install regex-toolkit +python3 -m pip install regex-toolkit ``` Development version from [**GitHub**](https://github.com/Phosmic/regex-toolkit): @@ -48,9 +48,9 @@ Development version from [**GitHub**](https://github.com/Phosmic/regex-toolkit): ```bash -$ git clone git+https://github.com/Phosmic/regex-toolkit.git -$ cd regex-toolkit -$ python3 -m pip install -e . +git clone git+https://github.com/Phosmic/regex-toolkit.git +cd regex-toolkit +python3 -m pip install -e . ``` --- diff --git a/docs/templates/install.md.jinja b/docs/templates/install.md.jinja index dcc34d3..7a86e51 100644 --- a/docs/templates/install.md.jinja +++ b/docs/templates/install.md.jinja @@ -5,7 +5,7 @@ Most stable version from [**PyPi**](https://pypi.org/project/{{ pypi.name }}/): [![PyPI - License](https://img.shields.io/pypi/l/{{ pypi.name }}?style=flat-square)](https://pypi.org/project/{{ pypi.name }}/) ```bash -$ python3 -m pip install {{ pypi.name }} +python3 -m pip install {{ pypi.name }} ``` Development version from [**GitHub**](https://github.com/{{ repo.owner }}/{{ repo.name }}): @@ -21,7 +21,7 @@ Development version from [**GitHub**](https://github.com/{{ repo.owner }}/{{ rep {% endif %} ```bash -$ git clone git+https://github.com/{{ repo.owner }}/{{ repo.name }}.git -$ cd {{ repo.name }} -$ python3 -m pip install -e . 
+git clone git+https://github.com/{{ repo.owner }}/{{ repo.name }}.git +cd {{ repo.name }} +python3 -m pip install -e . ``` \ No newline at end of file From aced5498920b698c993b466125870b2ab2b6e923 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sat, 19 Aug 2023 13:41:54 -0500 Subject: [PATCH 08/16] Improve readability in a few docstring examples when rendered to markdown --- README.md | 30 ++++++++++++++++++++++++++++++ src/regex_toolkit/base.py | 3 +++ src/regex_toolkit/utils.py | 37 +++++++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e09f927..12bdf69 100644 --- a/README.md +++ b/README.md @@ -220,6 +220,13 @@ Normalize a Unicode string to NFC form C. Form C favors the use of a fully combined character. +**Example**: + +```python +to_nfc("e\\u0301") == "Γ©" +# Output: True +``` + **Arguments**: - `text` _str_ - String to normalize. @@ -239,6 +246,16 @@ def iter_char_range(first_char: str, Iterate all characters within a range of characters (inclusive). +**Example**: + +```python +char_range("a", "c") +# Output: ('a', 'b', 'c') + +char_range("c", "a") +# Output: ('c', 'b', 'a') +``` + **Arguments**: - `first_char` _str_ - Starting (first) character. @@ -258,6 +275,16 @@ def char_range(first_char: str, last_char: str) -> tuple[str, ...] Tuple of all characters within a range of characters (inclusive). +**Example**: + +```python +char_range("a", "d") +# Output: ('a', 'b', 'c', 'd') + +char_range("d", "a") +# Output: ('d', 'c', 'b', 'a') +``` + **Arguments**: - `first_char` _str_ - Starting (first) character. @@ -396,6 +423,9 @@ def make_exp(chars: Iterable[str], flavor: int = 1) -> str Create a regex expression that exactly matches a list of characters. +The characters are sorted and grouped into ranges where possible. +The expression is not anchored, so it can be used as part of a larger expression. + **Example**: ```python diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index a89db85..e116b74 100644 --- a/src/regex_toolkit/base.py +++ b/src/regex_toolkit/base.py @@ -151,6 +151,9 @@ def _make_group_exp2(group: list[int]) -> str: def make_exp(chars: Iterable[str], flavor: int = 1) -> str: """Create a regex expression that exactly matches a list of characters. + The characters are sorted and grouped into ranges where possible. + The expression is not anchored, so it can be used as part of a larger expression. + Example: ```python diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py index d0c27ad..a63c30e 100644 --- a/src/regex_toolkit/utils.py +++ b/src/regex_toolkit/utils.py @@ -133,6 +133,13 @@ def to_nfc(text: str) -> str: Form C favors the use of a fully combined character. + Example: + + ```python + to_nfc("e\\u0301") == "Γ©" + # Output: True + ``` + Args: text (str): String to normalize. @@ -145,6 +152,16 @@ def to_nfc(text: str) -> str: def iter_char_range(first_char: str, last_char: str) -> Generator[str, None, None]: """Iterate all characters within a range of characters (inclusive). + Example: + + ```python + char_range("a", "c") + # Output: ('a', 'b', 'c') + + char_range("c", "a") + # Output: ('c', 'b', 'a') + ``` + Args: first_char (str): Starting (first) character. last_char (str): Ending (last) character. @@ -152,13 +169,29 @@ def iter_char_range(first_char: str, last_char: str) -> Generator[str, None, Non Yields: str: Characters within a range of characters. 
""" - for i in range(ord(first_char), ord(last_char) + 1): - yield chr(i) + first_ord = ord(first_char) + last_ord = ord(last_char) + if first_ord > last_ord: + ord_range = range(first_ord, last_ord - 1, -1) + else: + ord_range = range(first_ord, last_ord + 1) + for ordinal in ord_range: + yield chr(ordinal) def char_range(first_char: str, last_char: str) -> tuple[str, ...]: """Tuple of all characters within a range of characters (inclusive). + Example: + + ```python + char_range("a", "d") + # Output: ('a', 'b', 'c', 'd') + + char_range("d", "a") + # Output: ('d', 'c', 'b', 'a') + ``` + Args: first_char (str): Starting (first) character. last_char (str): Ending (last) character. From 8089a1f97ac0903675efa7c63e47d790214c07c9 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sun, 20 Aug 2023 00:28:27 -0500 Subject: [PATCH 09/16] Considering test parameterization. - Added `to_nfc` test. - Avoiding lru_cache when evalutating enum (just build expression on script initialization to prevent slowness for now). - Additional/improved edge cases in some tests. - Added reserved regex expressions to constants. --- README.md | 3 +- src/regex_toolkit/__init__.py | 3 + src/regex_toolkit/constants.py | 15 +- src/regex_toolkit/enums.py | 8 + src/regex_toolkit/utils.py | 22 +- tests/test_base.py | 832 +++++++++++---------------------- tests/test_enums.py | 29 +- tests/test_utils.py | 196 +++++--- 8 files changed, 443 insertions(+), 665 deletions(-) diff --git a/README.md b/README.md index 12bdf69..7287d88 100644 --- a/README.md +++ b/README.md @@ -82,8 +82,7 @@ import regex_toolkit #### `validate_regex_flavor` ```python -@lru_cache(maxsize=2) -def validate_regex_flavor(flavor: int) -> None | NoReturn +def validate_regex_flavor(flavor: int) -> None ``` Validate a regex flavor. diff --git a/src/regex_toolkit/__init__.py b/src/regex_toolkit/__init__.py index 7976583..c3f8a79 100644 --- a/src/regex_toolkit/__init__.py +++ b/src/regex_toolkit/__init__.py @@ -4,6 +4,7 @@ string_as_exp, strings_as_exp, ) +from .enums import RegexFlavor from .utils import ( char_range, char_to_cpoint, @@ -28,9 +29,11 @@ "cpoint_to_ord", "iter_char_range", "iter_sort_by_len", + "make_exp", "mask_span", "mask_spans", "ord_to_cpoint", + "RegexFlavor", "sort_by_len", "string_as_exp", "strings_as_exp", diff --git a/src/regex_toolkit/constants.py b/src/regex_toolkit/constants.py index 0d7cc43..abd6b97 100644 --- a/src/regex_toolkit/constants.py +++ b/src/regex_toolkit/constants.py @@ -2,16 +2,20 @@ This module contains constant values used throughout the project. 
""" +from __future__ import annotations -from typing import Final +from typing import TYPE_CHECKING -from regex_toolkit.enums import RegexFlavor +if TYPE_CHECKING: + from typing import Final __all__ = [ "ALWAYS_ESCAPE", "ALWAYS_SAFE", "ASCIILETTERS", + # "DEFAULT_REGEX_FLAVOR", "DIGITS", + "RESERVED_EXPRESSIONS", ] DIGITS: Final[frozenset[str]] = frozenset(map(chr, b"0123456789")) @@ -22,7 +26,8 @@ ALWAYS_ESCAPE: Final[frozenset[str]] = frozenset( map(chr, b"()[]{}?*+-|^$\\.&~# \t\n\r\v\f") ) - -REGEX_FLAVORS: Final[frozenset[RegexFlavor]] = frozenset( - {RegexFlavor.RE, RegexFlavor.RE2} +RESERVED_EXPRESSIONS: Final[frozenset[str]] = frozenset( + {"\\A", "\\b", "\\B", "\\d", "\\D", "\\s", "\\S", "\\w", "\\W", "\\Z", "\\1"} ) + +# DEFAULT_REGEX_FLAVOR: Final[RegexFlavor] = RegexFlavor.RE diff --git a/src/regex_toolkit/enums.py b/src/regex_toolkit/enums.py index 55a9b9d..4f7c8cc 100644 --- a/src/regex_toolkit/enums.py +++ b/src/regex_toolkit/enums.py @@ -2,6 +2,11 @@ from enum import Enum +__all__ = [ + "ALL_REGEX_FLAVORS", + "RegexFlavor", +] + class RegexFlavor(int, Enum): """Regex flavors. @@ -13,3 +18,6 @@ class RegexFlavor(int, Enum): RE = 1 RE2 = 2 + + +ALL_REGEX_FLAVORS: list[RegexFlavor] = [RegexFlavor.RE, RegexFlavor.RE2] diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py index a63c30e..c049951 100644 --- a/src/regex_toolkit/utils.py +++ b/src/regex_toolkit/utils.py @@ -1,28 +1,26 @@ import unicodedata from collections.abc import Generator, Iterable -from functools import lru_cache -from typing import NoReturn from regex_toolkit.enums import RegexFlavor __all__ = [ - "validate_regex_flavor", - "iter_sort_by_len", - "sort_by_len", - "ord_to_cpoint", - "cpoint_to_ord", + "char_range", "char_to_cpoint", - "to_utf8", - "to_nfc", + "cpoint_to_ord", "iter_char_range", - "char_range", + "iter_sort_by_len", "mask_span", "mask_spans", + "ord_to_cpoint", + "sort_by_len", + "to_nfc", + "to_utf8", + "validate_regex_flavor", ] -@lru_cache(maxsize=2) -def validate_regex_flavor(flavor: int) -> None | NoReturn: +# TODO: Could optimize speed with caching through lru_cache or mapping +def validate_regex_flavor(flavor: int) -> None: """Validate a regex flavor. 
Args: diff --git a/tests/test_base.py b/tests/test_base.py index 1e71d9e..0111e2b 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,578 +1,294 @@ import re import unittest +from collections.abc import Iterable from itertools import product import pytest import re2 import regex_toolkit -from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE +from regex_toolkit.constants import ( + ALWAYS_ESCAPE, + ALWAYS_SAFE, + RESERVED_EXPRESSIONS, +) from regex_toolkit.enums import RegexFlavor +NON_ASCII_CHARS = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + + +def _exp_will_match(exp: str, text: str, flavor: int) -> bool: + if flavor == 1: + return bool(re.search(exp, text)) + if flavor == 2: + return bool(re2.search(exp, text)) + raise ValueError(f"Invalid regex flavor: {flavor}") + + +def assert_exp_will_match(exp: str, text: str, flavor: int) -> bool: + assert _exp_will_match( + exp, text, flavor + ), f"RE{flavor} Pattern: {exp!r} does not match {text!r}" + + +def assert_exp_will_match_all(exp: str, texts: Iterable[str], flavor: int) -> bool: + for text in texts: + assert_exp_will_match(exp, text, flavor) + + +# RE and RE2 - Escape + + +@pytest.mark.parametrize("char, expected", [(char, char) for char in ALWAYS_SAFE]) +@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +def test_escape_and_escape2_safe(char, expected, flavor): + actual = regex_toolkit.escape(char, flavor) + assert actual == expected + assert_exp_will_match(actual, char, flavor) + + +@pytest.mark.parametrize( + "char, expected_exp", [(char, f"\\{char}") for char in ALWAYS_ESCAPE] +) +@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +def test_escape_and_escape2_escapable(char, expected_exp, flavor): + actual = regex_toolkit.escape(char, flavor) + assert actual == expected_exp + assert_exp_will_match(actual, char, flavor) + + +# RE - Escape + + +@pytest.mark.parametrize( + "char, expected_exp", + [(char, f"\\{char}") for char in NON_ASCII_CHARS], +) +def test_escape_unknown(char, expected_exp): + actual = regex_toolkit.escape(char, RegexFlavor.RE) + assert actual == expected_exp + assert_exp_will_match(actual, char, RegexFlavor.RE) + + +# RE2 - Escape + + +@pytest.mark.parametrize( + "char, expected", + [ + (char, "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}") + for char in NON_ASCII_CHARS + ], +) +def test_escape2_unknown(char, expected): + actual = regex_toolkit.escape(char, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match(actual, char, RegexFlavor.RE2) + + +def test_escape2_trimmed(): + text = "Β°" + expected = "\\x{00b0}" + actual = regex_toolkit.escape(text, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE2) + + +def test_escape2_untrimmed(): + text = "πŸ…°" + expected = "\\x{0001f170}" + actual = regex_toolkit.escape(text, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE2) + + +# RE and RE2 - String as expression + + +@pytest.mark.parametrize("text, expected", [(text, text) for text in ALWAYS_SAFE]) +@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +def test_string_as_exp_and_exp2_safe_individual_char(text, expected, flavor): + actual = regex_toolkit.string_as_exp(text, flavor) + assert actual == expected + assert_exp_will_match(actual, text, flavor) + + +@pytest.mark.parametrize("flavor", 
[RegexFlavor.RE, RegexFlavor.RE2]) +def test_string_as_exp_and_exp2_safe_joined_as_one(flavor): + text = "".join(ALWAYS_SAFE) + expected = "".join(ALWAYS_SAFE) + actual = regex_toolkit.string_as_exp(text, flavor) + assert actual == expected + assert_exp_will_match(actual, text, flavor) + + +@pytest.mark.parametrize( + "text, expected", [(char, f"\\{char}") for char in ALWAYS_ESCAPE] +) +@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +def test_string_as_exp_and_exp2_escapable_individual_char(text, expected, flavor): + actual = regex_toolkit.string_as_exp(text, flavor) + assert actual == expected + assert_exp_will_match(actual, text, flavor) + + +@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +def test_string_as_exp_and_exp2_escapable_joined_as_one(flavor): + text = "".join(ALWAYS_ESCAPE) + expected = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) + actual = regex_toolkit.string_as_exp(text, flavor) + assert actual == expected + assert_exp_will_match(actual, text, flavor) -class TestEscapeRE(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE - self._re_compile = re.compile - - def test_safe(self): - for char in ALWAYS_SAFE: - with self.subTest(char=char): - expected_exp = char - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - def test_escapable(self): - for char in ALWAYS_ESCAPE: - with self.subTest(char=char): - expected_exp = f"\\{char}" - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - def test_unknown(self): - # TODO: Include additional characters to test. - for char in "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…": - with self.subTest(char=char): - expected_exp = f"\\{char}" - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - -class TestEscapeRE2(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE2 - self._re_compile = re2.compile - - def test_safe(self): - for char in ALWAYS_SAFE: - with self.subTest(char=char): - expected_exp = char - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - def test_escapable(self): - for char in ALWAYS_ESCAPE: - with self.subTest(char=char): - expected_exp = f"\\{char}" - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - def test_trimmed(self): - expected_exp = "\\x{00b0}" - actual_exp = regex_toolkit.escape("Β°", self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. 
- pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match("Β°")) - - def test_untrimmed(self): - expected_exp = "\\x{0001f170}" - actual_exp = regex_toolkit.escape("πŸ…°", self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match("πŸ…°")) - - def test_unknown(self): - # TODO: Include additional characters to test. - # TODO: Cover chars that would be trimmed. - # NOTE: Same as running: "\\x{" + format(ord("πŸŒ„"), "x").zfill(8).removeprefix("0000") + "}" - for char, expected_exp in ( - # Length 1 - ("πŸ…°", "\\x{0001f170}"), - ("πŸ…±", "\\x{0001f171}"), - ("πŸ…Ύ", "\\x{0001f17e}"), - ("πŸ…Ώ", "\\x{0001f17f}"), - ("πŸ†Ž", "\\x{0001f18e}"), - ("πŸ†‘", "\\x{0001f191}"), - ("πŸ†’", "\\x{0001f192}"), - ("πŸ†“", "\\x{0001f193}"), - ("πŸ†”", "\\x{0001f194}"), - ("πŸ†•", "\\x{0001f195}"), - ("πŸ†–", "\\x{0001f196}"), - ("πŸ†—", "\\x{0001f197}"), - ("πŸ†˜", "\\x{0001f198}"), - ("πŸ†™", "\\x{0001f199}"), - ("πŸ†š", "\\x{0001f19a}"), - ("πŸ‡¦", "\\x{0001f1e6}"), - ("πŸ‡§", "\\x{0001f1e7}"), - ("πŸ‡¨", "\\x{0001f1e8}"), - ("🈁", "\\x{0001f201}"), - ("πŸˆ‚", "\\x{0001f202}"), - ("🈚", "\\x{0001f21a}"), - ("🈯", "\\x{0001f22f}"), - ("🈲", "\\x{0001f232}"), - ("🈳", "\\x{0001f233}"), - ("🈴", "\\x{0001f234}"), - ("🈡", "\\x{0001f235}"), - ("🈢", "\\x{0001f236}"), - ("🈷", "\\x{0001f237}"), - ("🈸", "\\x{0001f238}"), - ("🈹", "\\x{0001f239}"), - ("🈺", "\\x{0001f23a}"), - ("πŸ‰", "\\x{0001f250}"), - ("πŸ‰‘", "\\x{0001f251}"), - ("πŸŒ€", "\\x{0001f300}"), - ("🌁", "\\x{0001f301}"), - ("πŸŒ‚", "\\x{0001f302}"), - ("πŸŒƒ", "\\x{0001f303}"), - ("πŸŒ„", "\\x{0001f304}"), - # Length 2 - ("πŸŒ…", "\\x{0001f305}"), - ): - with self.subTest(char=char): - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - -class TestStringAsExpressionRE(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE - self._re_compile = re.compile - - def test_safe_individual_char(self): - # Single character. - for char in ALWAYS_SAFE: - with self.subTest(char=char): - text = char - expected_exp = char - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - # TODO: Add tests for mix of characters. - def test_safe_joined_as_one(self): - # All characters. - text = "".join(ALWAYS_SAFE) - expected_exp = text - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_escapable_individual_char(self): - # Single character. - for char in ALWAYS_ESCAPE: - with self.subTest(char=char): - text = char - expected_exp = f"\\{char}" - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_escapable_joined_as_one(self): - # All characters. 
- text = "".join(ALWAYS_ESCAPE) - expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_unsafe_joined_as_one(self): - # All characters. - text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" - expected_exp = "".join(f"\\{char}" for char in text) - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - -class TestStringAsExpressionRE2(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE2 - self._re_compile = re2.compile - - # TODO: Add tests for mix of characters. - def test_safe_individual_char(self): - # Single character. - for char in ALWAYS_SAFE: - with self.subTest(char=char): - text = char - expected_exp = char - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_safe_joined_as_one(self): - # All characters. - text = "".join(ALWAYS_SAFE) - expected_exp = "".join(ALWAYS_SAFE) - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_escapable_individual_char(self): - # Single character. - for char in ALWAYS_ESCAPE: - with self.subTest(char=char): - text = char - expected_exp = f"\\{char}" - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_escapable_joined_as_one(self): - # All characters. - text = "".join(ALWAYS_ESCAPE) - expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_unknown_joined_as_one(self): - # TODO: Include additional characters to test. - # TODO: Cover chars that would be trimmed. 
- text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" - expected_exp = r"".join( - ( - "\\x{0001f170}", - "\\x{0001f171}", - "\\x{0001f17e}", - "\\x{0001f17f}", - "\\x{0001f18e}", - "\\x{0001f191}", - "\\x{0001f192}", - "\\x{0001f193}", - "\\x{0001f194}", - "\\x{0001f195}", - "\\x{0001f196}", - "\\x{0001f197}", - "\\x{0001f198}", - "\\x{0001f199}", - "\\x{0001f19a}", - "\\x{0001f1e6}", - "\\x{0001f1e7}", - "\\x{0001f1e8}", - "\\x{0001f201}", - "\\x{0001f202}", - "\\x{0001f21a}", - "\\x{0001f22f}", - "\\x{0001f232}", - "\\x{0001f233}", - "\\x{0001f234}", - "\\x{0001f235}", - "\\x{0001f236}", - "\\x{0001f237}", - "\\x{0001f238}", - "\\x{0001f239}", - "\\x{0001f23a}", - "\\x{0001f250}", - "\\x{0001f251}", - "\\x{0001f300}", - "\\x{0001f301}", - "\\x{0001f302}", - "\\x{0001f303}", - "\\x{0001f304}", - # Length 2 - "\\x{0001f305}", - ) + +# RE - String as expression + + +@pytest.mark.parametrize( + "text, expected", + [(text, f"\\{text}") for text in NON_ASCII_CHARS], +) +def test_string_as_exp_unsafe_individual_char(text, expected): + actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE) + + +def test_string_as_exp_unsafe_joined_as_one(): + text = NON_ASCII_CHARS + expected = "".join(f"\\{char}" for char in text) + actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE) + + +# RE2 - String as expression + + +@pytest.mark.parametrize( + "text, expected", + [ + (char, "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}") + for char in NON_ASCII_CHARS + ], +) +def test_string_as_exp2_unknown_individual_char(text, expected): + actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE2) + + +def test_string_as_exp2_unknown_joined_as_one(): + text = NON_ASCII_CHARS + expected = "".join( + "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}" + for char in text + ) + actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE2) + + +# RE and RE2 - Strings as expression + + +@pytest.mark.parametrize( + "texts, expected", + [(texts, r"|".join(texts)) for texts in product(ALWAYS_SAFE, repeat=2)], +) +@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +def test_strings_as_exp_and_exp2_safe_of_various_lengths(texts, expected, flavor): + actual = regex_toolkit.strings_as_exp(texts, flavor) + assert actual == expected + assert_exp_will_match_all(actual, texts, flavor) + + +@pytest.mark.parametrize( + "texts, expected", + [ + (texts, r"|".join(f"\\{text}" for text in texts)) + for texts in product(ALWAYS_ESCAPE, repeat=2) + ], +) +@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +def test_strings_as_exp_and_exp2_escapable_of_various_lengths(texts, expected, flavor): + actual = regex_toolkit.strings_as_exp(texts, flavor) + assert actual == expected + assert_exp_will_match_all(actual, texts, flavor) + + +@pytest.mark.parametrize( + "texts, expected", + [ + (texts, r"|".join(f"\\{text}" for text in texts)) + for texts in product(RESERVED_EXPRESSIONS, repeat=2) + ], +) +@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +def 
test_strings_as_exp_and_exp2_reserved_of_various_lengths(texts, expected, flavor): + actual = regex_toolkit.strings_as_exp(texts, flavor) + assert actual == expected + assert_exp_will_match_all(actual, texts, flavor) + + +@pytest.mark.parametrize( + "texts, expected", + [ + ( + texts, + r"|".join(text if text in ALWAYS_SAFE else f"\\{text}" for text in texts), ) - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) + for texts in product(ALWAYS_SAFE | ALWAYS_ESCAPE, repeat=2) + ], +) +@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +def test_strings_as_exp_and_exp2_safe_and_escapable_of_various_lengths( + texts, expected, flavor +): + actual = regex_toolkit.strings_as_exp(texts, flavor) + assert actual == expected + assert_exp_will_match_all(actual, texts, flavor) -RESERVED_EXPRESSIONS = frozenset( - {"\\A", "\\b", "\\B", "\\d", "\\D", "\\s", "\\S", "\\w", "\\W", "\\Z", "\\1"} +# RE - Strings as expression + + +@pytest.mark.parametrize( + "texts, expected", + [ + (texts, r"|".join(f"\\{text}" for text in texts)) + for texts in product(NON_ASCII_CHARS, repeat=2) + ], +) +def test_strings_as_exp_unsafe_of_various_lengths(texts, expected): + actual = regex_toolkit.strings_as_exp(texts, RegexFlavor.RE) + assert actual == expected + assert_exp_will_match_all(actual, texts, RegexFlavor.RE) + + +# RE2 - Strings as expression + + +@pytest.mark.parametrize( + "texts, expected", + [ + ( + texts, + r"|".join( + "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}" + for char in texts + ), + ) + for texts in product(NON_ASCII_CHARS, repeat=2) + ], ) +def test_strings_as_exp2_unsafe_of_various_lengths(texts, expected): + actual = regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match_all(actual, texts, RegexFlavor.RE2) -class StringsAsExpressionRE(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE - self._re_compile = re.compile - self._max_combo_length = 2 - - def test_safe_of_various_lengths(self): - # Unique combinations of `ALWAYS_SAFE` using various lengths. - elements = ALWAYS_SAFE - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_escapable_of_various_lengths(self): - # Unique combinations of `ALWAYS_ESCAPE` using various lengths. - elements = ALWAYS_ESCAPE - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(f"\\{text}" for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_reserved_of_various_lengths(self): - # Unique combinations of reserved expressions using various lengths. 
- # Exact matches that equate to reserved spaces - # E.g. Should match '\\' + 'n', not r'\n' - elements = RESERVED_EXPRESSIONS - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(f"\\{text}" for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_unsafe_of_various_lengths(self): - # TODO: Include text/chars such as punctuation, etc. - # Unique combinations of `ALWAYS_SAFE` using various lengths. - elements = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(f"\\{text}" for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_safe_and_escapable_of_various_lengths(self): - # Unique combinations of `ALWAYS_SAFE` and `ALWAYS_ESCAPE` using various lengths. - elements = ALWAYS_SAFE | ALWAYS_ESCAPE - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join( - text if text in ALWAYS_SAFE else f"\\{text}" for text in texts - ) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - # def test_actual_examples(self): - # - - -###################### -###################### # Multiple unsafe char -###################### self.assertEqual( -###################### regex_toolkit.strings_as_exp([".", "!", "?"], self._flavor), -###################### "\\.|\\!|\\?", -###################### ) -###################### -###################### for texts, expected_exp in [ -###################### (["πŸ…°"], "\\πŸ…°"), -###################### (["πŸ…°", "πŸ…±"], "\\πŸ…°|\\πŸ…±"), -###################### (["alpha", "beta"], "alpha|beta"), -###################### (["πŸ…°lpha", "πŸ…±eta"], "\\πŸ…°lpha|\\πŸ…±eta"), -###################### (["πŸ…°lpha", "Beta"], "\\πŸ…°lpha|Beta"), -###################### ]: -###################### self.assertEqual( -###################### regex_toolkit.strings_as_exp(texts, self._flavor), -###################### expected_exp, -###################### ) - - -class StringsAsExpressionRE2(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE2 - self._re_compile = re2.compile - self._max_combo_length = 2 - - def test_safe_of_variable_lengths(self): - # Unique combinations of ALWAYS_SAFE using various lengths. 
- elements = set(ALWAYS_SAFE) - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_escapable_of_variable_lengths(self): - # Unique combinations of ALWAYS_ESCAPE using various lengths. - elements = ALWAYS_ESCAPE - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(f"\\{text}" for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_reserved_of_variable_lengths(self): - # Unique combinations of reserved expressions using various lengths. - # Exact matches that equate to reserved spaces - # E.g. Should match '\\' + 'n', not r'\n' - elements = RESERVED_EXPRESSIONS - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(f"\\{text}" for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_unsafe_of_variable_lengths(self): - # TODO: Include text/chars such as punctuation, etc. - # Unique combinations of ALWAYS_SAFE using various lengths. - elements_map = { - # Length 1 - "πŸ…°": "\\x{0001f170}", - "πŸ…±": "\\x{0001f171}", - "πŸ…Ύ": "\\x{0001f17e}", - "πŸ…Ώ": "\\x{0001f17f}", - "πŸ†Ž": "\\x{0001f18e}", - "πŸ†‘": "\\x{0001f191}", - "πŸ†’": "\\x{0001f192}", - "πŸ†“": "\\x{0001f193}", - "πŸ†”": "\\x{0001f194}", - "πŸ†•": "\\x{0001f195}", - "πŸ†–": "\\x{0001f196}", - "πŸ†—": "\\x{0001f197}", - "πŸ†˜": "\\x{0001f198}", - "πŸ†™": "\\x{0001f199}", - "πŸ†š": "\\x{0001f19a}", - "πŸ‡¦": "\\x{0001f1e6}", - "πŸ‡§": "\\x{0001f1e7}", - "πŸ‡¨": "\\x{0001f1e8}", - "🈁": "\\x{0001f201}", - "πŸˆ‚": "\\x{0001f202}", - "🈚": "\\x{0001f21a}", - "🈯": "\\x{0001f22f}", - "🈲": "\\x{0001f232}", - "🈳": "\\x{0001f233}", - "🈴": "\\x{0001f234}", - "🈡": "\\x{0001f235}", - "🈢": "\\x{0001f236}", - "🈷": "\\x{0001f237}", - "🈸": "\\x{0001f238}", - "🈹": "\\x{0001f239}", - "🈺": "\\x{0001f23a}", - "πŸ‰": "\\x{0001f250}", - "πŸ‰‘": "\\x{0001f251}", - "πŸŒ€": "\\x{0001f300}", - "🌁": "\\x{0001f301}", - "πŸŒ‚": "\\x{0001f302}", - "πŸŒƒ": "\\x{0001f303}", - "πŸŒ„": "\\x{0001f304}", - # Length 2 - "πŸŒ…": "\\x{0001f305}", - } - elements = tuple(elements_map) - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(elements_map[text] for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. 
- pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - -##############################3 # Exact matches that equate to reserved spaces -##############################3 # E.g. Should match '\\' + 'n', not r'\n' -##############################3 for text in ("\\w", "\\W", "\\d", "\\D", "\\s", "\\S", "\\1"): -##############################3 texts = [text] -##############################3 with self.subTest(texts=texts): -##############################3 self.assertEqual( -##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), -##############################3 f"\\{text}", -##############################3 ) -##############################3 -##############################3 # Single whitespace char -##############################3 for texts in (["\n"], ["\v"], ["\t"], ["\r"], ["\f"], ["\v"]): -##############################3 with self.subTest(texts=texts): -##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, self._flavor), texts[0]) -##############################3 -##############################3 # Single unsafe char -##############################3 for texts, expected_exp in [ -##############################3 (["."], "\\."), -##############################3 (["!"], "\\!"), -##############################3 (["?"], "\\?"), -##############################3 ]: -##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): -##############################3 self.assertEqual( -##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), -##############################3 expected_exp, -##############################3 ) -##############################3 -##############################3 # Multiple unsafe char -##############################3 texts = [".", "!", "?"] -##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, self._flavor), "\\.|\\!|\\?") -##############################3 -##############################3 for texts, expected_exp in [ -##############################3 (["πŸ…°"], "\\x{0001f170}"), -##############################3 (["πŸ…°", "πŸ…±"], "\\x{0001f170}|\\x{0001f171}"), -##############################3 (["alpha", "beta"], "alpha|beta"), -##############################3 (["πŸ…°lpha", "πŸ…±eta"], "\\x{0001f170}lpha|\\x{0001f171}eta"), -##############################3 (["πŸ…°lpha", "Beta"], "\\x{0001f170}lpha|Beta"), -##############################3 ]: -##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): -##############################3 self.assertEqual( -##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), -##############################3 expected_exp, -##############################3 ) - -# TODO: Add tests for actually compiling the e. 
+# Make expression @pytest.mark.parametrize( diff --git a/tests/test_enums.py b/tests/test_enums.py index 3af0dbd..d061d36 100644 --- a/tests/test_enums.py +++ b/tests/test_enums.py @@ -3,18 +3,23 @@ from regex_toolkit.enums import RegexFlavor -def test_regex_flavor_enum_is_int(): - assert isinstance(RegexFlavor.RE, int) - assert RegexFlavor.RE == 1 - assert RegexFlavor(1) == RegexFlavor.RE - assert isinstance(RegexFlavor.RE2, int) - assert RegexFlavor.RE2 == 2 - assert RegexFlavor(2) == RegexFlavor.RE2 +def test_regex_flavor_enum_has_expected_members(): + assert len(RegexFlavor) == 2 + assert len(set(RegexFlavor)) == len(RegexFlavor) + assert RegexFlavor.RE.name == "RE" + assert RegexFlavor.RE.value == RegexFlavor.RE == RegexFlavor(1) == 1 + assert RegexFlavor(1) is RegexFlavor.RE -def test_invalid_regex_flavor_raises_value_error(): - with pytest.raises(ValueError): - RegexFlavor(0) + assert RegexFlavor.RE2.name == "RE2" + assert RegexFlavor.RE2 == RegexFlavor.RE2.value == RegexFlavor(2) == 2 + assert RegexFlavor(2) is RegexFlavor.RE2 - with pytest.raises(ValueError): - RegexFlavor(3) + +@pytest.mark.parametrize("invalid_flavor", (0, 3)) +def test_invalid_regex_flavor_raises_value_error(invalid_flavor): + with pytest.raises( + ValueError, + match=f"^{invalid_flavor} is not a valid RegexFlavor$", + ): + RegexFlavor(invalid_flavor) diff --git a/tests/test_utils.py b/tests/test_utils.py index aceaf9b..4f022f0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,7 +7,14 @@ @pytest.mark.parametrize("flavor", (1, 2)) -def test_validate_regex_flavor(flavor): +def test_validate_regex_flavor_when_int(flavor): + regex_toolkit.validate_regex_flavor(flavor) + + +@pytest.mark.parametrize( + "flavor", (1, 2, regex_toolkit.RegexFlavor.RE, regex_toolkit.RegexFlavor.RE2) +) +def test_validate_regex(flavor): regex_toolkit.validate_regex_flavor(flavor) @@ -32,81 +39,118 @@ def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool: return True -class TestSortByLength(unittest.TestCase): - def setUp(self) -> None: - self.texts = { - "apple", - "orange", - "banana", - "grape", - "apricot", - "cherry", - "plum", - "blueberry", - "strawberry", - "blackberry", - } - self.texts_by_type = ( - (set, self.texts), - (frozenset, frozenset(self.texts)), - (tuple, tuple(self.texts)), - (list, list(self.texts)), - (dict, dict.fromkeys(self.texts, None)), - ) - - def test_iter_sort_by_len(self): - for try_type, typed_texts in self.texts_by_type: - for reverse in (False, True): - with self.subTest( - try_type=try_type, - typed_texts=typed_texts, - reverse=reverse, - ): - result = regex_toolkit.iter_sort_by_len( - typed_texts, - reverse=reverse, - ) - self.assertIsInstance(result, Generator) - result_tuple = tuple(result) - self.assertTrue(is_sorted_by_len(result_tuple, reverse=reverse)) - self.assertEqual( - result_tuple, - tuple(sorted(typed_texts, key=len, reverse=reverse)), - ) - - def test_sort_by_len(self): - for try_type, typed_texts in self.texts_by_type: - for reverse in (False, True): - with self.subTest( - try_type=try_type, - typed_texts=typed_texts, - reverse=reverse, - ): - result = regex_toolkit.sort_by_len(typed_texts, reverse=reverse) - self.assertIsInstance(result, tuple) - self.assertTrue(is_sorted_by_len(result, reverse=reverse)) - self.assertEqual( - result, - tuple(sorted(typed_texts, key=len, reverse=reverse)), - ) - - -class TestIterCharRange(unittest.TestCase): - def test_iter_char_range(self): - result = regex_toolkit.iter_char_range("a", "z") - 
self.assertIsInstance(result, Generator) - self.assertTupleEqual( - tuple(result), - tuple("abcdefghijklmnopqrstuvwxyz"), - ) - - def test_char_range(self): - result = regex_toolkit.char_range("a", "z") - self.assertIsInstance(result, tuple) - self.assertTupleEqual( - result, - tuple("abcdefghijklmnopqrstuvwxyz"), - ) +SORT_BY_LEN_TEXTS = [ + "apple", + "orange", + "banana", + "grape", + "apricot", + "cherry", + "plum", + "blueberry", + "strawberry", + "blackberry", +] +SORT_BY_LEN_TEXTS_BY_TYPE = { + set: set(SORT_BY_LEN_TEXTS), + frozenset: frozenset(SORT_BY_LEN_TEXTS), + tuple: tuple(SORT_BY_LEN_TEXTS), + list: list(SORT_BY_LEN_TEXTS), + dict: dict.fromkeys(SORT_BY_LEN_TEXTS, None), +} + + +@pytest.mark.parametrize("try_type, typed_texts", SORT_BY_LEN_TEXTS_BY_TYPE.items()) +@pytest.mark.parametrize("reverse", (False, True)) +def test_iter_sort_by_len(try_type, typed_texts, reverse): + expected_tuple = tuple(sorted(typed_texts, key=len, reverse=reverse)) + assert is_sorted_by_len(expected_tuple, reverse=reverse) + + actual = regex_toolkit.iter_sort_by_len(typed_texts, reverse=reverse) + actual_tuple = tuple(actual) + assert isinstance(actual, Generator) and (actual_tuple == expected_tuple), { + "try_type": try_type, + "typed_texts": typed_texts, + "reverse": reverse, + "actual_tuple": actual_tuple, + "expected_tuple": expected_tuple, + } + + +@pytest.mark.parametrize("try_type, typed_texts", SORT_BY_LEN_TEXTS_BY_TYPE.items()) +@pytest.mark.parametrize("reverse", (False, True)) +def test_sort_by_len(try_type, typed_texts, reverse): + expected = tuple(sorted(typed_texts, key=len, reverse=reverse)) + assert is_sorted_by_len(expected, reverse=reverse) + + actual = regex_toolkit.sort_by_len(typed_texts, reverse=reverse) + assert isinstance(actual, tuple) and (actual == expected), { + "try_type": try_type, + "typed_texts": typed_texts, + "reverse": reverse, + "actual": actual, + "expected": expected, + } + + +ITER_CHAR_RANGE_CASES = [ + # Single char + (("a", "a"), ("a",)), + # Basic range + (("a", "d"), ("a", "b", "c", "d")), + # Reverse range + (("d", "a"), ("d", "c", "b", "a")), + # Single char (non-ASCII) + (("🐢", "🐺"), ("🐢", "🐷", "🐸", "🐹", "🐺")), +] + + +@pytest.mark.parametrize("char_range, expected", ITER_CHAR_RANGE_CASES) +def test_char_range(char_range, expected): + actual = regex_toolkit.char_range(*char_range) + assert isinstance(actual, tuple) + assert actual == expected, { + "char_range": char_range, + "actual": actual, + "expected": expected, + } + + +@pytest.mark.parametrize("char_range, expected", ITER_CHAR_RANGE_CASES) +def test_iter_char_range(char_range, expected): + actual = regex_toolkit.iter_char_range(*char_range) + assert isinstance(actual, Generator) + actual_tuple = tuple(actual) + assert actual_tuple == expected, { + "char_range": char_range, + "actual_tuple": actual_tuple, + "expected": expected, + } + + +@pytest.mark.parametrize( + "text, expected", + ( + # Empty string + ("", ""), + # Already NFC + ("a", "a"), + # Already NFC (non-ASCII) + ("🐢🐾", "🐢🐾"), + # Basic combining char (acute accent) + ("a\u0301", "Γ‘"), + # Multiple combining chars (diaeresis and acute accent) + ("o\u0308\u0301", "ấ"), + ), +) +def test_to_nfc(text, expected): + actual = regex_toolkit.to_nfc(text) + assert isinstance(actual, str) + assert actual == expected, { + "text": text, + "actual": actual, + "expected": expected, + } class TestMasking(unittest.TestCase): From 5b7b7709818e07db3d622d121bbed7af0e78e9a0 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sun, 20 Aug 2023 00:42:15 -0500 
Subject: [PATCH 10/16] Including requirements-test.txt for convenience --- .gitignore | 1 + requirements-test.txt | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 requirements-test.txt diff --git a/.gitignore b/.gitignore index 5013d26..b04526a 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ !environment.yml !codecov.yml !requirements-doc.txt +!requirements-test.txt !src/ !src/* diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..2204b77 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,4 @@ +pytest>=7.0.0 +pytest-cov +pytest-xdist>=2.2.0 +# pytest-asyncio>=0.17 From 3d3b24f9a036a64e3fb9d42b16645b4b93b1fde9 Mon Sep 17 00:00:00 2001 From: yaphott Date: Thu, 24 Aug 2023 18:08:49 -0500 Subject: [PATCH 11/16] Use full match in test instead of search --- tests/test_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_base.py b/tests/test_base.py index 0111e2b..b8cb047 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -19,9 +19,9 @@ def _exp_will_match(exp: str, text: str, flavor: int) -> bool: if flavor == 1: - return bool(re.search(exp, text)) + return bool(re.fullmatch(exp, text)) if flavor == 2: - return bool(re2.search(exp, text)) + return bool(re2.fullmatch(exp, text)) raise ValueError(f"Invalid regex flavor: {flavor}") From fed836455c083e347e2cf74881c7ae1d45d2b419 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sat, 26 Aug 2023 02:44:11 -0500 Subject: [PATCH 12/16] Short-term implementation adding ability to change default regex flavor at any point --- src/regex_toolkit/__init__.py | 2 -- src/regex_toolkit/base.py | 67 ++++++++++++----------------------- src/regex_toolkit/utils.py | 39 +++++++++++++++----- tests/test_base.py | 39 +++++++++++--------- tests/test_utils.py | 42 +++++++++++++++------- 5 files changed, 105 insertions(+), 84 deletions(-) diff --git a/src/regex_toolkit/__init__.py b/src/regex_toolkit/__init__.py index c3f8a79..7abf3ae 100644 --- a/src/regex_toolkit/__init__.py +++ b/src/regex_toolkit/__init__.py @@ -17,7 +17,6 @@ sort_by_len, to_nfc, to_utf8, - validate_regex_flavor, ) __version__ = "0.0.5" @@ -39,5 +38,4 @@ "strings_as_exp", "to_nfc", "to_utf8", - "validate_regex_flavor", ] diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index e116b74..9d0772a 100644 --- a/src/regex_toolkit/base.py +++ b/src/regex_toolkit/base.py @@ -4,15 +4,14 @@ "strings_as_exp", "make_exp", ] -from collections.abc import Callable, Iterable -from typing import Final +from collections.abc import Iterable from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE from regex_toolkit.enums import RegexFlavor from regex_toolkit.utils import ( char_to_cpoint, iter_sort_by_len, - validate_regex_flavor, + resolve_flavor, ) @@ -37,18 +36,12 @@ def _escape2(char: str) -> str: return "\\x{" + char_to_cpoint(char).removeprefix("0000") + "}" -_ESCAPE_FUNC_MAP: Final[dict[int, Callable]] = { - RegexFlavor.RE: _escape, - RegexFlavor.RE2: _escape2, -} - - -def escape(char: str, flavor: int = 1) -> str: +def escape(char: str, flavor: int | None = None) -> str: """Create a regex expression that exactly matches a character. Args: char (str): Character to match. - flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. + flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None. Returns: str: Expression that exactly matches the original character. 
@@ -56,8 +49,9 @@ def escape(char: str, flavor: int = 1) -> str: Raises: ValueError: Invalid regex flavor. """ - validate_regex_flavor(flavor) - return _ESCAPE_FUNC_MAP[flavor](char) + if (flavor := resolve_flavor(flavor)) == RegexFlavor.RE: + return _escape(char) + return _escape2(char) def _string_as_exp(text: str) -> str: @@ -68,18 +62,12 @@ def _string_as_exp2(text: str) -> str: return r"".join(map(_escape2, text)) -_STRING_AS_EXP_FUNC_MAP: Final[dict[int, Callable]] = { - RegexFlavor.RE: _string_as_exp, - RegexFlavor.RE2: _string_as_exp2, -} - - -def string_as_exp(text: str, flavor: int = 1) -> str: +def string_as_exp(text: str, flavor: int | None = None) -> str: """Create a regex expression that exactly matches a string. Args: text (str): String to match. - flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. + flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None. Returns: str: Expression that exactly matches the original string. @@ -87,8 +75,9 @@ def string_as_exp(text: str, flavor: int = 1) -> str: Raises: ValueError: Invalid regex flavor. """ - validate_regex_flavor(flavor) - return _STRING_AS_EXP_FUNC_MAP[flavor](text) + if (flavor := resolve_flavor(flavor)) == RegexFlavor.RE: + return _string_as_exp(text) + return _string_as_exp2(text) def _strings_as_exp(texts: Iterable[str]) -> str: @@ -99,18 +88,12 @@ def _strings_as_exp2(texts: Iterable[str]) -> str: return r"|".join(map(_string_as_exp2, iter_sort_by_len(texts, reverse=True))) -_STRINGS_AS_EXP_FUNC_MAP: Final[dict[int, Callable]] = { - RegexFlavor.RE: _strings_as_exp, - RegexFlavor.RE2: _strings_as_exp2, -} - - -def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str: +def strings_as_exp(texts: Iterable[str], flavor: int | None = None) -> str: """Create a regex expression that exactly matches any one string. Args: texts (Iterable[str]): Strings to match. - flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. + flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None. Returns: str: Expression that exactly matches any one of the original strings. @@ -118,18 +101,17 @@ def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str: Raises: ValueError: Invalid regex flavor. """ - validate_regex_flavor(flavor) - return _STRINGS_AS_EXP_FUNC_MAP[flavor](texts) + if (flavor := resolve_flavor(flavor)) == RegexFlavor.RE: + return _strings_as_exp(texts) + return _strings_as_exp2(texts) def _make_group_exp(group: list[int]) -> str: if len(group) > 2: # Represent as a character range - print(f"{group = }") return _escape(chr(group[0])) + "-" + _escape(chr(group[-1])) else: # Represent as individual characters - print(f"{group = }") return "".join((_escape(chr(char_ord)) for char_ord in group)) @@ -142,13 +124,7 @@ def _make_group_exp2(group: list[int]) -> str: return "".join((_escape2(chr(char_ord)) for char_ord in group)) -_MAKE_GROUP_EXP_FUNC_MAP: Final[dict[int, Callable]] = { - RegexFlavor.RE: _make_group_exp, - RegexFlavor.RE2: _make_group_exp2, -} - - -def make_exp(chars: Iterable[str], flavor: int = 1) -> str: +def make_exp(chars: Iterable[str], flavor: int | None = None) -> str: """Create a regex expression that exactly matches a list of characters. The characters are sorted and grouped into ranges where possible. @@ -163,7 +139,7 @@ def make_exp(chars: Iterable[str], flavor: int = 1) -> str: Args: chars (Iterable[str]): Characters to match. - flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. 
+        flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None.
 
     Returns:
         str: Expression that exactly matches the original characters.
@@ -171,8 +147,10 @@ def make_exp(chars: Iterable[str], flavor: int | None = None) -> str:
     Raises:
         ValueError: Invalid regex flavor.
     """
-    validate_regex_flavor(flavor)
-    func = _MAKE_GROUP_EXP_FUNC_MAP[flavor]
+    if (flavor := resolve_flavor(flavor)) == RegexFlavor.RE:
+        func = _make_group_exp
+    else:
+        func = _make_group_exp2
 
     exp = ""
     group = []
diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py
index c049951..a4954b0 100644
--- a/src/regex_toolkit/utils.py
+++ b/src/regex_toolkit/utils.py
@@ -7,32 +7,55 @@
     "char_range",
     "char_to_cpoint",
     "cpoint_to_ord",
+    "default_flavor",
     "iter_char_range",
     "iter_sort_by_len",
     "mask_span",
     "mask_spans",
     "ord_to_cpoint",
+    "resolve_flavor",
     "sort_by_len",
     "to_nfc",
     "to_utf8",
-    "validate_regex_flavor",
 ]
 
+default_flavor: int | RegexFlavor | None = RegexFlavor.RE
 
-# TODO: Could optimize speed with caching through lru_cache or mapping
-def validate_regex_flavor(flavor: int) -> None:
-    """Validate a regex flavor.
+
+def resolve_flavor(potential_flavor: int | RegexFlavor | None) -> RegexFlavor:
+    """Resolve a regex flavor.
+
+    If the flavor is an integer, it is validated and returned.
+    If the flavor is a RegexFlavor, it is returned.
+    If the flavor is None, the default flavor is returned. To change the default flavor, set `default_flavor`.
+
+    ```python
+    import regex_toolkit as rtk
+
+    rtk.utils.default_flavor = rtk.enums.RegexFlavor.RE2
+    assert rtk.utils.resolve_flavor(None) == rtk.enums.RegexFlavor.RE2
+    ```
 
     Args:
-        flavor (int): Regex flavor (1 for RE, 2 for RE2).
+        potential_flavor (int | RegexFlavor | None): Potential regex flavor.
+
+    Returns:
+        RegexFlavor: Resolved regex flavor.
 
     Raises:
         ValueError: Invalid regex flavor.
     """
     try:
-        flavor = RegexFlavor(flavor)
-    except ValueError:
-        raise ValueError(f"Invalid regex flavor: {flavor}")
+        return RegexFlavor(potential_flavor)
+    except ValueError as err:
+        global default_flavor
+        if default_flavor is not None:
+            try:
+                return RegexFlavor(default_flavor)
+            except ValueError as err:
+                raise ValueError(f"Invalid regex flavor: {potential_flavor}") from err
+        else:
+            raise ValueError(f"Invalid regex flavor: {potential_flavor}") from err
 
 
 def iter_sort_by_len(
diff --git a/tests/test_base.py b/tests/test_base.py
index b8cb047..4d1ac4e 100644
--- a/tests/test_base.py
+++ b/tests/test_base.py
@@ -1,5 +1,5 @@
+# import random
 import re
-import unittest
 from collections.abc import Iterable
 from itertools import product
 
@@ -12,8 +12,13 @@
     ALWAYS_SAFE,
     RESERVED_EXPRESSIONS,
 )
-from regex_toolkit.enums import RegexFlavor
+from regex_toolkit.enums import ALL_REGEX_FLAVORS, RegexFlavor
 
+# TODO: Change to cover a more diverse set of non-ASCII characters?
+# RANDOM_SAMPLE_SIZE = 50 +# NON_ASCII_CHARS = [chr(i) for i in range(0x0000, 0xFFFF) if not chr(i).isascii()] +# random.shuffle(NON_ASCII_CHARS) +# NON_ASCII_CHARS = NON_ASCII_CHARS[:RANDOM_SAMPLE_SIZE] NON_ASCII_CHARS = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" @@ -22,7 +27,7 @@ def _exp_will_match(exp: str, text: str, flavor: int) -> bool: return bool(re.fullmatch(exp, text)) if flavor == 2: return bool(re2.fullmatch(exp, text)) - raise ValueError(f"Invalid regex flavor: {flavor}") + raise ValueError(f"Invalid regex flavor: {flavor!r}") def assert_exp_will_match(exp: str, text: str, flavor: int) -> bool: @@ -40,7 +45,7 @@ def assert_exp_will_match_all(exp: str, texts: Iterable[str], flavor: int) -> bo @pytest.mark.parametrize("char, expected", [(char, char) for char in ALWAYS_SAFE]) -@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) def test_escape_and_escape2_safe(char, expected, flavor): actual = regex_toolkit.escape(char, flavor) assert actual == expected @@ -50,7 +55,7 @@ def test_escape_and_escape2_safe(char, expected, flavor): @pytest.mark.parametrize( "char, expected_exp", [(char, f"\\{char}") for char in ALWAYS_ESCAPE] ) -@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) def test_escape_and_escape2_escapable(char, expected_exp, flavor): actual = regex_toolkit.escape(char, flavor) assert actual == expected_exp @@ -106,14 +111,14 @@ def test_escape2_untrimmed(): @pytest.mark.parametrize("text, expected", [(text, text) for text in ALWAYS_SAFE]) -@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) def test_string_as_exp_and_exp2_safe_individual_char(text, expected, flavor): actual = regex_toolkit.string_as_exp(text, flavor) assert actual == expected assert_exp_will_match(actual, text, flavor) -@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) def test_string_as_exp_and_exp2_safe_joined_as_one(flavor): text = "".join(ALWAYS_SAFE) expected = "".join(ALWAYS_SAFE) @@ -125,14 +130,14 @@ def test_string_as_exp_and_exp2_safe_joined_as_one(flavor): @pytest.mark.parametrize( "text, expected", [(char, f"\\{char}") for char in ALWAYS_ESCAPE] ) -@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) def test_string_as_exp_and_exp2_escapable_individual_char(text, expected, flavor): actual = regex_toolkit.string_as_exp(text, flavor) assert actual == expected assert_exp_will_match(actual, text, flavor) -@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) def test_string_as_exp_and_exp2_escapable_joined_as_one(flavor): text = "".join(ALWAYS_ESCAPE) expected = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) @@ -155,7 +160,7 @@ def test_string_as_exp_unsafe_individual_char(text, expected): def test_string_as_exp_unsafe_joined_as_one(): - text = NON_ASCII_CHARS + text = "".join(NON_ASCII_CHARS) expected = "".join(f"\\{char}" for char in text) actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE) assert actual == expected @@ -179,7 +184,7 @@ def test_string_as_exp2_unknown_individual_char(text, expected): def 
test_string_as_exp2_unknown_joined_as_one(): - text = NON_ASCII_CHARS + text = "".join(NON_ASCII_CHARS) expected = "".join( "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}" for char in text @@ -196,7 +201,7 @@ def test_string_as_exp2_unknown_joined_as_one(): "texts, expected", [(texts, r"|".join(texts)) for texts in product(ALWAYS_SAFE, repeat=2)], ) -@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) def test_strings_as_exp_and_exp2_safe_of_various_lengths(texts, expected, flavor): actual = regex_toolkit.strings_as_exp(texts, flavor) assert actual == expected @@ -210,7 +215,7 @@ def test_strings_as_exp_and_exp2_safe_of_various_lengths(texts, expected, flavor for texts in product(ALWAYS_ESCAPE, repeat=2) ], ) -@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) def test_strings_as_exp_and_exp2_escapable_of_various_lengths(texts, expected, flavor): actual = regex_toolkit.strings_as_exp(texts, flavor) assert actual == expected @@ -224,7 +229,7 @@ def test_strings_as_exp_and_exp2_escapable_of_various_lengths(texts, expected, f for texts in product(RESERVED_EXPRESSIONS, repeat=2) ], ) -@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) def test_strings_as_exp_and_exp2_reserved_of_various_lengths(texts, expected, flavor): actual = regex_toolkit.strings_as_exp(texts, flavor) assert actual == expected @@ -241,7 +246,7 @@ def test_strings_as_exp_and_exp2_reserved_of_various_lengths(texts, expected, fl for texts in product(ALWAYS_SAFE | ALWAYS_ESCAPE, repeat=2) ], ) -@pytest.mark.parametrize("flavor", [RegexFlavor.RE, RegexFlavor.RE2]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) def test_strings_as_exp_and_exp2_safe_and_escapable_of_various_lengths( texts, expected, flavor ): @@ -279,7 +284,7 @@ def test_strings_as_exp_unsafe_of_various_lengths(texts, expected): for char in texts ), ) - for texts in product(NON_ASCII_CHARS, repeat=2) + for texts in product(*NON_ASCII_CHARS, repeat=2) ], ) def test_strings_as_exp2_unsafe_of_various_lengths(texts, expected): @@ -315,4 +320,4 @@ def test_strings_as_exp2_unsafe_of_various_lengths(texts, expected): ), ) def test_make_exp(chars, expected): - assert regex_toolkit.make_exp(chars) == expected + assert regex_toolkit.make_exp(chars, RegexFlavor.RE) == expected diff --git a/tests/test_utils.py b/tests/test_utils.py index 4f022f0..463ba04 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,27 +1,45 @@ import unittest from collections.abc import Generator, Iterable +from unittest import mock import pytest import regex_toolkit - - -@pytest.mark.parametrize("flavor", (1, 2)) -def test_validate_regex_flavor_when_int(flavor): - regex_toolkit.validate_regex_flavor(flavor) +from regex_toolkit.enums import RegexFlavor @pytest.mark.parametrize( - "flavor", (1, 2, regex_toolkit.RegexFlavor.RE, regex_toolkit.RegexFlavor.RE2) + "potential_flavor, expected", + [ + (1, RegexFlavor.RE), + (2, RegexFlavor.RE2), + (RegexFlavor.RE, RegexFlavor.RE), + (RegexFlavor.RE2, RegexFlavor.RE2), + (RegexFlavor(1), RegexFlavor.RE), + (RegexFlavor(2), RegexFlavor.RE2), + ], ) -def test_validate_regex(flavor): - regex_toolkit.validate_regex_flavor(flavor) +def test_resolve_flavor_with_valid(potential_flavor, expected): + assert regex_toolkit.base.resolve_flavor(potential_flavor) == expected + + 
+@mock.patch("regex_toolkit.utils.default_flavor", None) +def test_resolve_flavor_with_invalid_and_with_no_default_raises_value_error(): + with pytest.raises(ValueError, match=r"^Invalid regex flavor: None$"): + regex_toolkit.base.resolve_flavor(None) + + +@pytest.mark.parametrize("potential_flavor", [None, 0, 3, "1", "2"]) +@mock.patch("regex_toolkit.utils.default_flavor", RegexFlavor.RE) +def test_resolve_flavor_falls_back_to_default(potential_flavor): + regex_toolkit.base.resolve_flavor(potential_flavor) == RegexFlavor.RE -@pytest.mark.parametrize("flavor", (0, 3)) -def test_validate_regex_flavor_invalid(flavor): - with pytest.raises(ValueError, match=r"^Invalid regex flavor: \d+$"): - regex_toolkit.validate_regex_flavor(flavor) +@pytest.mark.parametrize("potential_flavor", [None, 0, 3, "1", "2"]) +@mock.patch("regex_toolkit.utils.default_flavor", None) +def test_resolve_flavor_invalid_int_without_default_raises(potential_flavor): + with pytest.raises(ValueError, match=r"^Invalid regex flavor: (None|'?\d'?)$"): + regex_toolkit.base.resolve_flavor(potential_flavor) def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool: From 5edd6eb8c83fab3b791b1d0fdf50885b2dcfad11 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sat, 26 Aug 2023 02:44:35 -0500 Subject: [PATCH 13/16] Update readme --- README.md | 82 ++++++++++++++++++++++++++++------- docs/templates/usage.md.jinja | 43 ++++++++++++++++-- 2 files changed, 107 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 7287d88..15058e8 100644 --- a/README.md +++ b/README.md @@ -57,18 +57,55 @@ python3 -m pip install -e . ## Usage -Import packages: +To harness the toolkit's capabilities, you should import the necessary packages: ```python import re # and/or import re2 +import regex_toolkit as rtk ``` +For instance, if you wish to create a regex pattern that matches all unicode letters and marks, and supplement it with additional code points consistent with Objective-C, you can do it as follows: + ```python -import regex_toolkit +unicode_letters_and_marks = r"\p{L}\p{M}" + rtk.make_exp( + [ + *rtk.char_range("\uf870", "\uf87f"), + "\uf882", + *rtk.char_range("\uf884", "\uf89f"), + "\uf8b8", + *rtk.char_range("\uf8c1", "\uf8d6"), + ], + flavor=2, +) +# Output: r'\p{L}\p{M}\x{f870}-\x{f87f}\x{f882}\x{f884}-\x{f89f}\x{f8b8}\x{f8c1}-\x{f8d6}' ``` +This representation is more intuitive and maintainable than conventional methods. + +### Why Use `regex_toolkit`? + +Standard unicode regex groups (like `\p{L}` and `\p{M}`) have definitions that vary across languages and versions. By using the toolkit, you can achieve a more consistent and comprehensive representation of unicode support. It is especially useful to supplement base unicode sets with the latest definitions from other languages and standards. + +### RE2 Overview + +RE2 focuses on safely processing regular expressions, particularly from untrusted inputs. It ensures both linear match time and efficient memory usage. Although it might not always surpass other engines in speed, it intentionally omits features that depend solely on backtracking, like backreferences and look-around assertions. + +A brief rundown of RE2 terminology: + +- **BitState**: An execution engine that uses backtracking search. +- **bytecode**: The set of instructions that form an automaton. +- **DFA**: The engine for Deterministic Finite Automaton searches. +- **NFA**: Implements the Nondeterministic Finite Automaton search method. +- **OnePass**: A one-pass search execution engine. 
+- **pattern**: The textual form of a regex. +- **Prog**: The compiled version of a regex. +- **Regexp**: The parsed version of a regex. +- **Rune**: A character in terms of encoding, essentially a code point. + +For an in-depth exploration, please refer to the [RE2 documentation](https://github.com/google/re2/wiki/Glossary). + --- ## Library @@ -77,19 +114,34 @@ import regex_toolkit # `regex_toolkit.utils` - + -#### `validate_regex_flavor` +#### `resolve_flavor` ```python -def validate_regex_flavor(flavor: int) -> None +def resolve_flavor(potential_flavor: int | RegexFlavor | None) -> RegexFlavor ``` -Validate a regex flavor. +Resolve a regex flavor. + +If the flavor is an integer, it is validated and returned. +If the flavor is a RegexFlavor, it is returned. +If the flavor is None, the default flavor is returned. To change the default flavor, set `default_flavor`. + +```python +import regex_toolkit as rtk + +rtk.utils.default_flavor = rtk.enums.RegexFlavor.RE2 +assert rtk.utils.resolve_flavor(None) == rtk.enums.RegexFlavor.RE2 +``` **Arguments**: -- `flavor` _int_ - Regex flavor (1 for RE, 2 for RE2). +- `potential_flavor` _int | RegexFlavor | None_ - Potential regex flavor. + +**Returns**: + +- _RegexFlavor_ - Resolved regex flavor. **Raises**: @@ -348,7 +400,7 @@ Todo: Add support for overlapping (and unordered?) spans. #### `escape` ```python -def escape(char: str, flavor: int = 1) -> str +def escape(char: str, flavor: int | None = None) -> str ``` Create a regex expression that exactly matches a character. @@ -356,7 +408,7 @@ Create a regex expression that exactly matches a character. **Arguments**: - `char` _str_ - Character to match. -- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. +- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None. **Returns**: @@ -371,7 +423,7 @@ Create a regex expression that exactly matches a character. #### `string_as_exp` ```python -def string_as_exp(text: str, flavor: int = 1) -> str +def string_as_exp(text: str, flavor: int | None = None) -> str ``` Create a regex expression that exactly matches a string. @@ -379,7 +431,7 @@ Create a regex expression that exactly matches a string. **Arguments**: - `text` _str_ - String to match. -- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. +- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None. **Returns**: @@ -394,7 +446,7 @@ Create a regex expression that exactly matches a string. #### `strings_as_exp` ```python -def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str +def strings_as_exp(texts: Iterable[str], flavor: int | None = None) -> str ``` Create a regex expression that exactly matches any one string. @@ -402,7 +454,7 @@ Create a regex expression that exactly matches any one string. **Arguments**: - `texts` _Iterable[str]_ - Strings to match. -- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. +- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None. **Returns**: @@ -417,7 +469,7 @@ Create a regex expression that exactly matches any one string. #### `make_exp` ```python -def make_exp(chars: Iterable[str], flavor: int = 1) -> str +def make_exp(chars: Iterable[str], flavor: int | None = None) -> str ``` Create a regex expression that exactly matches a list of characters. 
@@ -435,7 +487,7 @@ exp = "[" + make_exp(["a", "b", "c", "z", "y", "x"]) + "]" **Arguments**: - `chars` _Iterable[str]_ - Characters to match. -- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. +- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None. **Returns**: diff --git a/docs/templates/usage.md.jinja b/docs/templates/usage.md.jinja index 393c0d8..458c5ca 100644 --- a/docs/templates/usage.md.jinja +++ b/docs/templates/usage.md.jinja @@ -1,11 +1,48 @@ -Import packages: +To harness the toolkit's capabilities, you should import the necessary packages: ```python import re # and/or import re2 +import regex_toolkit as rtk ``` +For instance, if you wish to create a regex pattern that matches all unicode letters and marks, and supplement it with additional code points consistent with Objective-C, you can do it as follows: + ```python -import regex_toolkit -``` \ No newline at end of file +unicode_letters_and_marks = r"\p{L}\p{M}" + rtk.make_exp( + [ + *rtk.char_range("\uf870", "\uf87f"), + "\uf882", + *rtk.char_range("\uf884", "\uf89f"), + "\uf8b8", + *rtk.char_range("\uf8c1", "\uf8d6"), + ], + flavor=2, +) +# Output: r'\p{L}\p{M}\x{f870}-\x{f87f}\x{f882}\x{f884}-\x{f89f}\x{f8b8}\x{f8c1}-\x{f8d6}' +``` + +This representation is more intuitive and maintainable than conventional methods. + +### Why Use `regex_toolkit`? + +Standard unicode regex groups (like `\p{L}` and `\p{M}`) have definitions that vary across languages and versions. By using the toolkit, you can achieve a more consistent and comprehensive representation of unicode support. It is especially useful to supplement base unicode sets with the latest definitions from other languages and standards. + +### RE2 Overview + +RE2 focuses on safely processing regular expressions, particularly from untrusted inputs. It ensures both linear match time and efficient memory usage. Although it might not always surpass other engines in speed, it intentionally omits features that depend solely on backtracking, like backreferences and look-around assertions. + +A brief rundown of RE2 terminology: + +- **BitState**: An execution engine that uses backtracking search. +- **bytecode**: The set of instructions that form an automaton. +- **DFA**: The engine for Deterministic Finite Automaton searches. +- **NFA**: Implements the Nondeterministic Finite Automaton search method. +- **OnePass**: A one-pass search execution engine. +- **pattern**: The textual form of a regex. +- **Prog**: The compiled version of a regex. +- **Regexp**: The parsed version of a regex. +- **Rune**: A character in terms of encoding, essentially a code point. + +For an in-depth exploration, please refer to the [RE2 documentation](https://github.com/google/re2/wiki/Glossary). 
\ No newline at end of file

From 87b7f9e5af3d3e7ce64aca0b805fbfc9c3737f39 Mon Sep 17 00:00:00 2001
From: yaphott
Date: Tue, 12 Sep 2023 18:36:45 -0500
Subject: [PATCH 14/16] RE2 should only be required for testing

---
 ci/deps/actions-310.yml | 3 ---
 ci/deps/actions-311.yml | 3 ---
 environment.yml         | 3 ---
 pyproject.toml          | 2 +-
 requirements-test.txt   | 2 +-
 5 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/ci/deps/actions-310.yml b/ci/deps/actions-310.yml
index a3a6672..8bd2b2a 100644
--- a/ci/deps/actions-310.yml
+++ b/ci/deps/actions-310.yml
@@ -8,8 +8,5 @@ dependencies:
   - pytest>=7.0.0
   - pytest-cov
   - pytest-xdist>=2.2.0
-  # - pytest-asyncio>=0.17
-
-  # Required dependencies
   - pip:
       - google-re2>=1.0
diff --git a/ci/deps/actions-311.yml b/ci/deps/actions-311.yml
index 4a16510..b816df2 100644
--- a/ci/deps/actions-311.yml
+++ b/ci/deps/actions-311.yml
@@ -8,8 +8,5 @@ dependencies:
   - pytest>=7.0.0
   - pytest-cov
   - pytest-xdist>=2.2.0
-  # - pytest-asyncio>=0.17
-
-  # Required dependencies
   - pip:
       - google-re2>=1.0
diff --git a/environment.yml b/environment.yml
index 367bdfc..09760df 100644
--- a/environment.yml
+++ b/environment.yml
@@ -10,10 +10,7 @@ dependencies:
  - pytest>=7.0.0
  - pytest-cov
  - pytest-xdist>=2.2.0
-  # - pytest-asyncio>=0.17
  - coverage
-
-  # Required dependencies
  - pip:
      - google-re2>=1.0
diff --git a/pyproject.toml b/pyproject.toml
index fe46401..3806815 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
     "Topic :: Software Development :: Libraries",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["google-re2>=1.0"]
+dependencies = []
 dynamic = ["version"]

 [project.urls]
diff --git a/requirements-test.txt b/requirements-test.txt
index 2204b77..bed0b62 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,4 +1,4 @@
 pytest>=7.0.0
 pytest-cov
 pytest-xdist>=2.2.0
-# pytest-asyncio>=0.17
+google-re2>=1.0

From ef5d5f3da4f45af77031d161acbb18b5244a8f9f Mon Sep 17 00:00:00 2001
From: yaphott
Date: Sat, 16 Sep 2023 09:43:23 -0500
Subject: [PATCH 15/16] Move `default_flavor` from `regex_toolkit.utils` to
 `regex_toolkit.base` and add test for changing the default.

---
 README.md                      |  2 +-
 src/regex_toolkit/__init__.py  |  2 +-
 src/regex_toolkit/base.py      |  3 +++
 src/regex_toolkit/constants.py |  3 ---
 src/regex_toolkit/utils.py     | 11 ++++-------
 tests/test_base.py             |  5 -----
 tests/test_utils.py            | 12 +++++++++---
 7 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 15058e8..e2a46ff 100644
--- a/README.md
+++ b/README.md
@@ -131,7 +131,7 @@ If the flavor is None, the default flavor is returned.
To change the default fla ```python import regex_toolkit as rtk -rtk.utils.default_flavor = rtk.enums.RegexFlavor.RE2 +rtk.base.default_flavor = 2 assert rtk.utils.resolve_flavor(None) == rtk.enums.RegexFlavor.RE2 ``` diff --git a/src/regex_toolkit/__init__.py b/src/regex_toolkit/__init__.py index 7abf3ae..7303ef7 100644 --- a/src/regex_toolkit/__init__.py +++ b/src/regex_toolkit/__init__.py @@ -22,10 +22,10 @@ __version__ = "0.0.5" __all__ = [ - "escape", "char_range", "char_to_cpoint", "cpoint_to_ord", + "escape", "iter_char_range", "iter_sort_by_len", "make_exp", diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index 9d0772a..d59aea2 100644 --- a/src/regex_toolkit/base.py +++ b/src/regex_toolkit/base.py @@ -1,4 +1,5 @@ __all__ = [ + "default_flavor", "escape", "string_as_exp", "strings_as_exp", @@ -14,6 +15,8 @@ resolve_flavor, ) +default_flavor: int | RegexFlavor | None = RegexFlavor.RE + def _escape(char: str) -> str: if char in ALWAYS_SAFE: diff --git a/src/regex_toolkit/constants.py b/src/regex_toolkit/constants.py index abd6b97..542a922 100644 --- a/src/regex_toolkit/constants.py +++ b/src/regex_toolkit/constants.py @@ -13,7 +13,6 @@ "ALWAYS_ESCAPE", "ALWAYS_SAFE", "ASCIILETTERS", - # "DEFAULT_REGEX_FLAVOR", "DIGITS", "RESERVED_EXPRESSIONS", ] @@ -29,5 +28,3 @@ RESERVED_EXPRESSIONS: Final[frozenset[str]] = frozenset( {"\\A", "\\b", "\\B", "\\d", "\\D", "\\s", "\\S", "\\w", "\\W", "\\Z", "\\1"} ) - -# DEFAULT_REGEX_FLAVOR: Final[RegexFlavor] = RegexFlavor.RE diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py index a4954b0..81a35d5 100644 --- a/src/regex_toolkit/utils.py +++ b/src/regex_toolkit/utils.py @@ -1,13 +1,13 @@ import unicodedata from collections.abc import Generator, Iterable +import regex_toolkit.base from regex_toolkit.enums import RegexFlavor __all__ = [ "char_range", "char_to_cpoint", "cpoint_to_ord", - "default_flavor", "iter_char_range", "iter_sort_by_len", "mask_span", @@ -19,8 +19,6 @@ "to_utf8", ] -default_flavor: int | RegexFlavor | None = RegexFlavor.RE - def resolve_flavor(potential_flavor: int | RegexFlavor | None) -> RegexFlavor: """Resolve a regex flavor. @@ -32,7 +30,7 @@ def resolve_flavor(potential_flavor: int | RegexFlavor | None) -> RegexFlavor: ```python import regex_toolkit as rtk - rtk.utils.default_flavor = rtk.enums.RegexFlavor.RE2 + rtk.base.default_flavor = 2 assert rtk.utils.resolve_flavor(None) == rtk.enums.RegexFlavor.RE2 ``` @@ -48,10 +46,9 @@ def resolve_flavor(potential_flavor: int | RegexFlavor | None) -> RegexFlavor: try: return RegexFlavor(potential_flavor) except ValueError as err: - global default_flavor - if default_flavor is not None: + if regex_toolkit.base.default_flavor is not None: try: - return RegexFlavor(default_flavor) + return RegexFlavor(regex_toolkit.base.default_flavor) except ValueError as err: raise ValueError(f"Invalid regex flavor: {potential_flavor}") from err else: diff --git a/tests/test_base.py b/tests/test_base.py index 4d1ac4e..4020eae 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -14,11 +14,6 @@ ) from regex_toolkit.enums import ALL_REGEX_FLAVORS, RegexFlavor -# TODO: Change to cover a more diverse set of non-ASCII characters? 
-# RANDOM_SAMPLE_SIZE = 50 -# NON_ASCII_CHARS = [chr(i) for i in range(0x0000, 0xFFFF) if not chr(i).isascii()] -# random.shuffle(NON_ASCII_CHARS) -# NON_ASCII_CHARS = NON_ASCII_CHARS[:RANDOM_SAMPLE_SIZE] NON_ASCII_CHARS = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" diff --git a/tests/test_utils.py b/tests/test_utils.py index 463ba04..5c27b37 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -23,25 +23,31 @@ def test_resolve_flavor_with_valid(potential_flavor, expected): assert regex_toolkit.base.resolve_flavor(potential_flavor) == expected -@mock.patch("regex_toolkit.utils.default_flavor", None) +@mock.patch("regex_toolkit.base.default_flavor", None) def test_resolve_flavor_with_invalid_and_with_no_default_raises_value_error(): with pytest.raises(ValueError, match=r"^Invalid regex flavor: None$"): regex_toolkit.base.resolve_flavor(None) @pytest.mark.parametrize("potential_flavor", [None, 0, 3, "1", "2"]) -@mock.patch("regex_toolkit.utils.default_flavor", RegexFlavor.RE) +@mock.patch("regex_toolkit.base.default_flavor", RegexFlavor.RE) def test_resolve_flavor_falls_back_to_default(potential_flavor): regex_toolkit.base.resolve_flavor(potential_flavor) == RegexFlavor.RE @pytest.mark.parametrize("potential_flavor", [None, 0, 3, "1", "2"]) -@mock.patch("regex_toolkit.utils.default_flavor", None) +@mock.patch("regex_toolkit.base.default_flavor", None) def test_resolve_flavor_invalid_int_without_default_raises(potential_flavor): with pytest.raises(ValueError, match=r"^Invalid regex flavor: (None|'?\d'?)$"): regex_toolkit.base.resolve_flavor(potential_flavor) +@mock.patch("regex_toolkit.base.default_flavor", None) +def test_default_flavor_can_be_set(): + regex_toolkit.base.default_flavor = 2 + assert regex_toolkit.base.resolve_flavor(None) == RegexFlavor.RE2 + + def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool: prev_len = None for text in texts: From ac789223223e744a8f805d372b4f1e8eb90b7c88 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sat, 30 Sep 2023 23:00:54 -0500 Subject: [PATCH 16/16] Readme clean up --- README.md | 26 ++++++-------------------- docs/templates/usage.md.jinja | 26 ++++++-------------------- 2 files changed, 12 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index e2a46ff..74375b0 100644 --- a/README.md +++ b/README.md @@ -66,31 +66,17 @@ import re2 import regex_toolkit as rtk ``` -For instance, if you wish to create a regex pattern that matches all unicode letters and marks, and supplement it with additional code points consistent with Objective-C, you can do it as follows: - -```python -unicode_letters_and_marks = r"\p{L}\p{M}" + rtk.make_exp( - [ - *rtk.char_range("\uf870", "\uf87f"), - "\uf882", - *rtk.char_range("\uf884", "\uf89f"), - "\uf8b8", - *rtk.char_range("\uf8c1", "\uf8d6"), - ], - flavor=2, -) -# Output: r'\p{L}\p{M}\x{f870}-\x{f87f}\x{f882}\x{f884}-\x{f89f}\x{f8b8}\x{f8c1}-\x{f8d6}' -``` - -This representation is more intuitive and maintainable than conventional methods. - ### Why Use `regex_toolkit`? -Standard unicode regex groups (like `\p{L}` and `\p{M}`) have definitions that vary across languages and versions. By using the toolkit, you can achieve a more consistent and comprehensive representation of unicode support. It is especially useful to supplement base unicode sets with the latest definitions from other languages and standards. +Regex definitions vary across languages and versions. 
+By using the toolkit, you can achieve a more consistent and comprehensive representation of unicode support. +It is especially useful to supplement base unicode sets with the latest definitions from other languages and standards. ### RE2 Overview -RE2 focuses on safely processing regular expressions, particularly from untrusted inputs. It ensures both linear match time and efficient memory usage. Although it might not always surpass other engines in speed, it intentionally omits features that depend solely on backtracking, like backreferences and look-around assertions. +RE2 focuses on safely processing regular expressions, particularly from untrusted inputs. +It ensures both linear match time and efficient memory usage. +Although it might not always surpass other engines in speed, it intentionally omits features that depend solely on backtracking, like backreferences and look-around assertions. A brief rundown of RE2 terminology: diff --git a/docs/templates/usage.md.jinja b/docs/templates/usage.md.jinja index 458c5ca..e7166fe 100644 --- a/docs/templates/usage.md.jinja +++ b/docs/templates/usage.md.jinja @@ -7,31 +7,17 @@ import re2 import regex_toolkit as rtk ``` -For instance, if you wish to create a regex pattern that matches all unicode letters and marks, and supplement it with additional code points consistent with Objective-C, you can do it as follows: - -```python -unicode_letters_and_marks = r"\p{L}\p{M}" + rtk.make_exp( - [ - *rtk.char_range("\uf870", "\uf87f"), - "\uf882", - *rtk.char_range("\uf884", "\uf89f"), - "\uf8b8", - *rtk.char_range("\uf8c1", "\uf8d6"), - ], - flavor=2, -) -# Output: r'\p{L}\p{M}\x{f870}-\x{f87f}\x{f882}\x{f884}-\x{f89f}\x{f8b8}\x{f8c1}-\x{f8d6}' -``` - -This representation is more intuitive and maintainable than conventional methods. - ### Why Use `regex_toolkit`? -Standard unicode regex groups (like `\p{L}` and `\p{M}`) have definitions that vary across languages and versions. By using the toolkit, you can achieve a more consistent and comprehensive representation of unicode support. It is especially useful to supplement base unicode sets with the latest definitions from other languages and standards. +Regex definitions vary across languages and versions. +By using the toolkit, you can achieve a more consistent and comprehensive representation of unicode support. +It is especially useful to supplement base unicode sets with the latest definitions from other languages and standards. ### RE2 Overview -RE2 focuses on safely processing regular expressions, particularly from untrusted inputs. It ensures both linear match time and efficient memory usage. Although it might not always surpass other engines in speed, it intentionally omits features that depend solely on backtracking, like backreferences and look-around assertions. +RE2 focuses on safely processing regular expressions, particularly from untrusted inputs. +It ensures both linear match time and efficient memory usage. +Although it might not always surpass other engines in speed, it intentionally omits features that depend solely on backtracking, like backreferences and look-around assertions. A brief rundown of RE2 terminology: