diff --git a/.gitignore b/.gitignore index 5013d26..b04526a 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ !environment.yml !codecov.yml !requirements-doc.txt +!requirements-test.txt !src/ !src/* diff --git a/Makefile b/Makefile index 8392872..9115bef 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ PYTHON=python3 -APP_NAME=regex-toolkit +APP_NAME=regex_toolkit install: ${PYTHON} -m pip install . diff --git a/README.md b/README.md index 1be1bae..74375b0 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Most stable version from [**PyPi**](https://pypi.org/project/regex-toolkit/): [![PyPI - License](https://img.shields.io/pypi/l/regex-toolkit?style=flat-square)](https://pypi.org/project/regex-toolkit/) ```bash -$ python3 -m pip install regex-toolkit +python3 -m pip install regex-toolkit ``` Development version from [**GitHub**](https://github.com/Phosmic/regex-toolkit): @@ -48,26 +48,49 @@ Development version from [**GitHub**](https://github.com/Phosmic/regex-toolkit): ```bash -$ git clone git+https://github.com/Phosmic/regex-toolkit.git -$ cd regex-toolkit -$ python3 -m pip install -e . +git clone git+https://github.com/Phosmic/regex-toolkit.git +cd regex-toolkit +python3 -m pip install -e . ``` --- ## Usage -Import packages: +To harness the toolkit's capabilities, you should import the necessary packages: ```python import re # and/or import re2 +import regex_toolkit as rtk ``` -```python -import regex_toolkit -``` +### Why Use `regex_toolkit`? + +Regex definitions vary across languages and versions. +By using the toolkit, you can achieve a more consistent and comprehensive representation of unicode support. +It is especially useful to supplement base unicode sets with the latest definitions from other languages and standards. + +### RE2 Overview + +RE2 focuses on safely processing regular expressions, particularly from untrusted inputs. +It ensures both linear match time and efficient memory usage. 
+Although it might not always surpass other engines in speed, it intentionally omits features that depend solely on backtracking, like backreferences and look-around assertions. + +A brief rundown of RE2 terminology: + +- **BitState**: An execution engine that uses backtracking search. +- **bytecode**: The set of instructions that form an automaton. +- **DFA**: The engine for Deterministic Finite Automaton searches. +- **NFA**: Implements the Nondeterministic Finite Automaton search method. +- **OnePass**: A one-pass search execution engine. +- **pattern**: The textual form of a regex. +- **Prog**: The compiled version of a regex. +- **Regexp**: The parsed version of a regex. +- **Rune**: A character in terms of encoding, essentially a code point. + +For an in-depth exploration, please refer to the [RE2 documentation](https://github.com/google/re2/wiki/Glossary). --- @@ -77,6 +100,39 @@ import regex_toolkit # `regex_toolkit.utils` + + +#### `resolve_flavor` + +```python +def resolve_flavor(potential_flavor: int | RegexFlavor | None) -> RegexFlavor +``` + +Resolve a regex flavor. + +If the flavor is an integer, it is validated and returned. +If the flavor is a RegexFlavor, it is returned. +If the flavor is None, the default flavor is returned. To change the default flavor, set `default_flavor`. + +```python +import regex_toolkit as rtk + +rtk.base.default_flavor = 2 +assert rtk.utils.resolve_flavor(None) == rtk.enums.RegexFlavor.RE2 +``` + +**Arguments**: + +- `potential_flavor` _int | RegexFlavor | None_ - Potential regex flavor. + +**Returns**: + +- _RegexFlavor_ - Resolved regex flavor. + +**Raises**: + +- `ValueError` - Invalid regex flavor. + #### `iter_sort_by_len` @@ -134,8 +190,8 @@ The codepoint is always 8 characters long (zero-padded). **Example**: ```python -# Output: '00000061' ord_to_cpoint(97) +# Output: '00000061' ``` **Arguments**: @@ -177,8 +233,8 @@ Character to character codepoint. 
**Example**: ```python -# Output: '00000061' char_to_cpoint("a") +# Output: '00000061' ``` **Arguments**: @@ -201,6 +257,13 @@ Normalize a Unicode string to NFC form C. Form C favors the use of a fully combined character. +**Example**: + +```python +to_nfc("e\u0301") == "é" +# Output: True +``` + **Arguments**: - `text` _str_ - String to normalize. @@ -214,39 +277,59 @@ Form C favors the use of a fully combined character. #### `iter_char_range` ```python -def iter_char_range(first_cpoint: int, - last_cpoint: int) -> Generator[str, None, None] +def iter_char_range(first_char: str, + last_char: str) -> Generator[str, None, None] ``` -Iterate all characters within a range of codepoints (inclusive). +Iterate all characters within a range of characters (inclusive). + +**Example**: + +```python +tuple(iter_char_range("a", "c")) +# Output: ('a', 'b', 'c') + +tuple(iter_char_range("c", "a")) +# Output: ('c', 'b', 'a') +``` **Arguments**: -- `first_cpoint` _int_ - Starting (first) codepoint. -- `last_cpoint` _int_ - Ending (last) codepoint. +- `first_char` _str_ - Starting (first) character. +- `last_char` _str_ - Ending (last) character. **Yields**: -- _str_ - Characters within a range of codepoints. +- _str_ - Characters within a range of characters. #### `char_range` ```python -def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...] +def char_range(first_char: str, last_char: str) -> tuple[str, ...] ``` -Tuple of all characters within a range of codepoints (inclusive). +Tuple of all characters within a range of characters (inclusive). + +**Example**: + +```python +char_range("a", "d") +# Output: ('a', 'b', 'c', 'd') + +char_range("d", "a") +# Output: ('d', 'c', 'b', 'a') +``` **Arguments**: -- `first_cpoint` _int_ - Starting (first) codepoint. -- `last_cpoint` _int_ - Ending (last) codepoint. +- `first_char` _str_ - Starting (first) character. +- `last_char` _str_ - Ending (last) character. **Returns**: -- _tuple[str, ...]_ - Characters within a range of codepoints. 
+- _tuple[str, ...]_ - Characters within a range of characters. @@ -303,7 +386,7 @@ Todo: Add support for overlapping (and unordered?) spans. #### `escape` ```python -def escape(char: str, flavor: int = 1) -> str +def escape(char: str, flavor: int | None = None) -> str ``` Create a regex expression that exactly matches a character. @@ -311,7 +394,7 @@ Create a regex expression that exactly matches a character. **Arguments**: - `char` _str_ - Character to match. -- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. +- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None. **Returns**: @@ -326,7 +409,7 @@ Create a regex expression that exactly matches a character. #### `string_as_exp` ```python -def string_as_exp(text: str, flavor: int = 1) -> str +def string_as_exp(text: str, flavor: int | None = None) -> str ``` Create a regex expression that exactly matches a string. @@ -334,7 +417,7 @@ Create a regex expression that exactly matches a string. **Arguments**: - `text` _str_ - String to match. -- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. +- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None. **Returns**: @@ -349,7 +432,7 @@ Create a regex expression that exactly matches a string. #### `strings_as_exp` ```python -def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str +def strings_as_exp(texts: Iterable[str], flavor: int | None = None) -> str ``` Create a regex expression that exactly matches any one string. @@ -357,7 +440,7 @@ Create a regex expression that exactly matches any one string. **Arguments**: - `texts` _Iterable[str]_ - Strings to match. -- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. +- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None. **Returns**: @@ -367,6 +450,39 @@ Create a regex expression that exactly matches any one string. 
- `ValueError` - Invalid regex flavor. + + +#### `make_exp` + +```python +def make_exp(chars: Iterable[str], flavor: int | None = None) -> str +``` + +Create a regex expression that exactly matches a list of characters. + +The characters are sorted and grouped into ranges where possible. +The expression is not anchored, so it can be used as part of a larger expression. + +**Example**: + +```python +exp = "[" + make_exp(["a", "b", "c", "z", "y", "x"]) + "]" +# Output: '[a-cx-z]' +``` + +**Arguments**: + +- `chars` _Iterable[str]_ - Characters to match. +- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None. + +**Returns**: + +- _str_ - Expression that exactly matches the original characters. + +**Raises**: + +- `ValueError` - Invalid regex flavor. + # `regex_toolkit.enums` diff --git a/ci/deps/actions-310.yml b/ci/deps/actions-310.yml index a3a6672..8bd2b2a 100644 --- a/ci/deps/actions-310.yml +++ b/ci/deps/actions-310.yml @@ -8,8 +8,5 @@ dependencies: - pytest>=7.0.0 - pytest-cov - pytest-xdist>=2.2.0 - # - pytest-asyncio>=0.17 - - # Required dependencies - pip: - google-re2>=1.0 diff --git a/ci/deps/actions-311.yml b/ci/deps/actions-311.yml index 4a16510..b816df2 100644 --- a/ci/deps/actions-311.yml +++ b/ci/deps/actions-311.yml @@ -8,8 +8,5 @@ dependencies: - pytest>=7.0.0 - pytest-cov - pytest-xdist>=2.2.0 - # - pytest-asyncio>=0.17 - - # Required dependencies - pip: - google-re2>=1.0 diff --git a/docs/templates/install.md.jinja b/docs/templates/install.md.jinja index dcc34d3..7a86e51 100644 --- a/docs/templates/install.md.jinja +++ b/docs/templates/install.md.jinja @@ -5,7 +5,7 @@ Most stable version from [**PyPi**](https://pypi.org/project/{{ pypi.name }}/): [![PyPI - License](https://img.shields.io/pypi/l/{{ pypi.name }}?style=flat-square)](https://pypi.org/project/{{ pypi.name }}/) ```bash -$ python3 -m pip install {{ pypi.name }} +python3 -m pip install {{ pypi.name }} ``` Development version from 
[**GitHub**](https://github.com/{{ repo.owner }}/{{ repo.name }}): @@ -21,7 +21,7 @@ Development version from [**GitHub**](https://github.com/{{ repo.owner }}/{{ rep {% endif %} ```bash -$ git clone git+https://github.com/{{ repo.owner }}/{{ repo.name }}.git -$ cd {{ repo.name }} -$ python3 -m pip install -e . +git clone git+https://github.com/{{ repo.owner }}/{{ repo.name }}.git +cd {{ repo.name }} +python3 -m pip install -e . ``` \ No newline at end of file diff --git a/docs/templates/usage.md.jinja b/docs/templates/usage.md.jinja index 393c0d8..e7166fe 100644 --- a/docs/templates/usage.md.jinja +++ b/docs/templates/usage.md.jinja @@ -1,11 +1,34 @@ -Import packages: +To harness the toolkit's capabilities, you should import the necessary packages: ```python import re # and/or import re2 +import regex_toolkit as rtk ``` -```python -import regex_toolkit -``` \ No newline at end of file +### Why Use `regex_toolkit`? + +Regex definitions vary across languages and versions. +By using the toolkit, you can achieve a more consistent and comprehensive representation of unicode support. +It is especially useful to supplement base unicode sets with the latest definitions from other languages and standards. + +### RE2 Overview + +RE2 focuses on safely processing regular expressions, particularly from untrusted inputs. +It ensures both linear match time and efficient memory usage. +Although it might not always surpass other engines in speed, it intentionally omits features that depend solely on backtracking, like backreferences and look-around assertions. + +A brief rundown of RE2 terminology: + +- **BitState**: An execution engine that uses backtracking search. +- **bytecode**: The set of instructions that form an automaton. +- **DFA**: The engine for Deterministic Finite Automaton searches. +- **NFA**: Implements the Nondeterministic Finite Automaton search method. +- **OnePass**: A one-pass search execution engine. +- **pattern**: The textual form of a regex. 
+- **Prog**: The compiled version of a regex. +- **Regexp**: The parsed version of a regex. +- **Rune**: A character in terms of encoding, essentially a code point. + +For an in-depth exploration, please refer to the [RE2 documentation](https://github.com/google/re2/wiki/Glossary). \ No newline at end of file diff --git a/environment.yml b/environment.yml index 367bdfc..09760df 100644 --- a/environment.yml +++ b/environment.yml @@ -10,10 +10,7 @@ dependencies: - pytest>=7.0.0 - pytest-cov - pytest-xdist>=2.2.0 - # - pytest-asyncio>=0.17 - coverage - - # Required dependencies - pip: - google-re2>=1.0 diff --git a/pyproject.toml b/pyproject.toml index fe46401..3806815 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ "Topic :: Software Development :: Libraries", "Topic :: Software Development :: Libraries :: Python Modules", ] -dependencies = ["google-re2>=1.0"] +dependencies = [] dynamic = ["version"] [project.urls] diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..bed0b62 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,4 @@ +pytest>=7.0.0 +pytest-cov +pytest-xdist>=2.2.0 +google-re2>=1.0 diff --git a/src/regex_toolkit/__init__.py b/src/regex_toolkit/__init__.py index de90ba6..7303ef7 100644 --- a/src/regex_toolkit/__init__.py +++ b/src/regex_toolkit/__init__.py @@ -1,8 +1,10 @@ from .base import ( escape, + make_exp, string_as_exp, strings_as_exp, ) +from .enums import RegexFlavor from .utils import ( char_range, char_to_cpoint, @@ -17,18 +19,20 @@ to_utf8, ) -__version__ = "0.0.4" +__version__ = "0.0.5" __all__ = [ - "escape", "char_range", "char_to_cpoint", "cpoint_to_ord", + "escape", "iter_char_range", "iter_sort_by_len", + "make_exp", "mask_span", "mask_spans", "ord_to_cpoint", + "RegexFlavor", "sort_by_len", "string_as_exp", "strings_as_exp", diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index ca6a983..d59aea2 100644 --- a/src/regex_toolkit/base.py +++ 
b/src/regex_toolkit/base.py @@ -1,38 +1,21 @@ __all__ = [ + "default_flavor", "escape", "string_as_exp", "strings_as_exp", + "make_exp", ] from collections.abc import Iterable from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE from regex_toolkit.enums import RegexFlavor -from regex_toolkit.utils import char_to_cpoint, iter_sort_by_len +from regex_toolkit.utils import ( + char_to_cpoint, + iter_sort_by_len, + resolve_flavor, +) - -def escape(char: str, flavor: int = 1) -> str: - """Create a regex expression that exactly matches a character. - - Args: - char (str): Character to match. - flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. - - Returns: - str: Expression that exactly matches the original character. - - Raises: - ValueError: Invalid regex flavor. - """ - try: - flavor = RegexFlavor(flavor) - except ValueError: - raise ValueError(f"Invalid regex flavor: {flavor}") - - if flavor == RegexFlavor.RE: - return _escape(char) - # elif flavor == RegexFlavor.RE2: - else: - return _escape2(char) +default_flavor: int | RegexFlavor | None = RegexFlavor.RE def _escape(char: str) -> str: @@ -53,32 +36,25 @@ def _escape2(char: str) -> str: return f"\\{char}" else: # Otherwise escape using the codepoint - return "\\x{" + char_to_cpoint(char) + "}" + return "\\x{" + char_to_cpoint(char).removeprefix("0000") + "}" -def string_as_exp(text: str, flavor: int = 1) -> str: - """Create a regex expression that exactly matches a string. +def escape(char: str, flavor: int | None = None) -> str: + """Create a regex expression that exactly matches a character. Args: - text (str): String to match. - flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. + char (str): Character to match. + flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None. Returns: - str: Expression that exactly matches the original string. + str: Expression that exactly matches the original character. 
Raises: ValueError: Invalid regex flavor. """ - try: - flavor = RegexFlavor(flavor) - except ValueError: - raise ValueError(f"Invalid regex flavor: {flavor}") - - if flavor == RegexFlavor.RE: - return _string_as_exp(text) - # elif flavor == RegexFlavor.RE2: - else: - return _string_as_exp2(text) + if (flavor := resolve_flavor(flavor)) == RegexFlavor.RE: + return _escape(char) + return _escape2(char) def _string_as_exp(text: str) -> str: @@ -89,12 +65,38 @@ def _string_as_exp2(text: str) -> str: return r"".join(map(_escape2, text)) -def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str: +def string_as_exp(text: str, flavor: int | None = None) -> str: + """Create a regex expression that exactly matches a string. + + Args: + text (str): String to match. + flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None. + + Returns: + str: Expression that exactly matches the original string. + + Raises: + ValueError: Invalid regex flavor. + """ + if (flavor := resolve_flavor(flavor)) == RegexFlavor.RE: + return _string_as_exp(text) + return _string_as_exp2(text) + + +def _strings_as_exp(texts: Iterable[str]) -> str: + return r"|".join(map(_string_as_exp, iter_sort_by_len(texts, reverse=True))) + + +def _strings_as_exp2(texts: Iterable[str]) -> str: + return r"|".join(map(_string_as_exp2, iter_sort_by_len(texts, reverse=True))) + + +def strings_as_exp(texts: Iterable[str], flavor: int | None = None) -> str: """Create a regex expression that exactly matches any one string. Args: texts (Iterable[str]): Strings to match. - flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. + flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None. Returns: str: Expression that exactly matches any one of the original strings. @@ -102,21 +104,70 @@ def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str: Raises: ValueError: Invalid regex flavor. 
""" - try: - flavor = RegexFlavor(flavor) - except ValueError: - raise ValueError(f"Invalid regex flavor: {flavor}") - - if flavor == RegexFlavor.RE: + if (flavor := resolve_flavor(flavor)) == RegexFlavor.RE: return _strings_as_exp(texts) - # elif flavor == RegexFlavor.RE2: + return _strings_as_exp2(texts) + + +def _make_group_exp(group: list[int]) -> str: + if len(group) > 2: + # Represent as a character range + return _escape(chr(group[0])) + "-" + _escape(chr(group[-1])) else: - return _strings_as_exp2(texts) + # Represent as individual characters + return "".join((_escape(chr(char_ord)) for char_ord in group)) -def _strings_as_exp(texts: Iterable[str]) -> str: - return r"|".join(map(_string_as_exp, iter_sort_by_len(texts, reverse=True))) +def _make_group_exp2(group: list[int]) -> str: + if len(group) > 2: + # Represent as a character range + return _escape2(chr(group[0])) + "-" + _escape2(chr(group[-1])) + else: + # Represent as individual characters + return "".join((_escape2(chr(char_ord)) for char_ord in group)) -def _strings_as_exp2(texts: Iterable[str]) -> str: - return r"|".join(map(_string_as_exp2, iter_sort_by_len(texts, reverse=True))) +def make_exp(chars: Iterable[str], flavor: int | None = None) -> str: + """Create a regex expression that exactly matches a list of characters. + + The characters are sorted and grouped into ranges where possible. + The expression is not anchored, so it can be used as part of a larger expression. + + Example: + + ```python + exp = "[" + make_exp(["a", "b", "c", "z", "y", "x"]) + "]" + # Output: '[a-cx-z]' + ``` + + Args: + chars (Iterable[str]): Characters to match. + flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None. + + Returns: + str: Expression that exactly matches the original characters. + + Raises: + ValueError: Invalid regex flavor. 
+ """ + func = _make_group_exp2 + if (flavor := resolve_flavor(flavor)) == RegexFlavor.RE: + func = _make_group_exp + + exp = "" + group = [] + for char_ord in sorted(set(map(ord, chars))): + if not group: + # Start first group + group.append(char_ord) + elif char_ord == group[-1] + 1: + # Add to current group + group.append(char_ord) + else: + # Make the group and start a new one + exp += func(group) + group = [char_ord] + if group: + # Make any remaining group + exp += func(group) + return exp diff --git a/src/regex_toolkit/constants.py b/src/regex_toolkit/constants.py index 0d7cc43..542a922 100644 --- a/src/regex_toolkit/constants.py +++ b/src/regex_toolkit/constants.py @@ -2,16 +2,19 @@ This module contains constant values used throughout the project. """ +from __future__ import annotations -from typing import Final +from typing import TYPE_CHECKING -from regex_toolkit.enums import RegexFlavor +if TYPE_CHECKING: + from typing import Final __all__ = [ "ALWAYS_ESCAPE", "ALWAYS_SAFE", "ASCIILETTERS", "DIGITS", + "RESERVED_EXPRESSIONS", ] DIGITS: Final[frozenset[str]] = frozenset(map(chr, b"0123456789")) @@ -22,7 +25,6 @@ ALWAYS_ESCAPE: Final[frozenset[str]] = frozenset( map(chr, b"()[]{}?*+-|^$\\.&~# \t\n\r\v\f") ) - -REGEX_FLAVORS: Final[frozenset[RegexFlavor]] = frozenset( - {RegexFlavor.RE, RegexFlavor.RE2} +RESERVED_EXPRESSIONS: Final[frozenset[str]] = frozenset( + {"\\A", "\\b", "\\B", "\\d", "\\D", "\\s", "\\S", "\\w", "\\W", "\\Z", "\\1"} ) diff --git a/src/regex_toolkit/enums.py b/src/regex_toolkit/enums.py index 55a9b9d..4f7c8cc 100644 --- a/src/regex_toolkit/enums.py +++ b/src/regex_toolkit/enums.py @@ -2,6 +2,11 @@ from enum import Enum +__all__ = [ + "ALL_REGEX_FLAVORS", + "RegexFlavor", +] + class RegexFlavor(int, Enum): """Regex flavors. 
@@ -13,3 +18,6 @@ class RegexFlavor(int, Enum): RE = 1 RE2 = 2 + + +ALL_REGEX_FLAVORS: list[RegexFlavor] = [RegexFlavor.RE, RegexFlavor.RE2] diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py index 41c9df3..81a35d5 100644 --- a/src/regex_toolkit/utils.py +++ b/src/regex_toolkit/utils.py @@ -1,21 +1,60 @@ import unicodedata from collections.abc import Generator, Iterable +import regex_toolkit.base +from regex_toolkit.enums import RegexFlavor + __all__ = [ - "iter_sort_by_len", - "sort_by_len", - "ord_to_cpoint", - "cpoint_to_ord", + "char_range", "char_to_cpoint", - "to_utf8", - "to_nfc", + "cpoint_to_ord", "iter_char_range", - "char_range", + "iter_sort_by_len", "mask_span", "mask_spans", + "ord_to_cpoint", + "resolve_flavor", + "sort_by_len", + "to_nfc", + "to_utf8", ] +def resolve_flavor(potential_flavor: int | RegexFlavor | None) -> RegexFlavor: + """Resolve a regex flavor. + + If the flavor is an integer, it is validated and returned. + If the flavor is a RegexFlavor, it is returned. + If the flavor is None, the default flavor is returned. To change the default flavor, set `default_flavor`. + + ```python + import regex_toolkit as rtk + + rtk.base.default_flavor = 2 + assert rtk.utils.resolve_flavor(None) == rtk.enums.RegexFlavor.RE2 + ``` + + Args: + potential_flavor (int | RegexFlavor | None): Potential regex flavor. + + Returns: + RegexFlavor: Resolved regex flavor. + + Raises: + ValueError: Invalid regex flavor. 
+ """ + try: + return RegexFlavor(potential_flavor) + except ValueError as err: + if regex_toolkit.base.default_flavor is not None: + try: + return RegexFlavor(regex_toolkit.base.default_flavor) + except ValueError as err: + raise ValueError(f"Invalid regex flavor: {potential_flavor}") from err + else: + raise ValueError(f"Invalid regex flavor: {potential_flavor}") from err + + def iter_sort_by_len( texts: Iterable[str], *, @@ -59,8 +98,8 @@ def ord_to_cpoint(ordinal: int) -> str: Example: ```python - # Output: '00000061' ord_to_cpoint(97) + # Output: '00000061' ``` Args: @@ -90,8 +129,8 @@ def char_to_cpoint(char: str) -> str: Example: ```python - # Output: '00000061' char_to_cpoint("a") + # Output: '00000061' ``` Args: @@ -112,6 +151,13 @@ def to_nfc(text: str) -> str: Form C favors the use of a fully combined character. + Example: + + ```python + to_nfc("e\\u0301") == "é" + # Output: True + ``` + Args: text (str): String to normalize. @@ -121,31 +167,57 @@ def to_nfc(text: str) -> str: return unicodedata.normalize("NFC", text) -def iter_char_range(first_cpoint: int, last_cpoint: int) -> Generator[str, None, None]: - """Iterate all characters within a range of codepoints (inclusive). +def iter_char_range(first_char: str, last_char: str) -> Generator[str, None, None]: + """Iterate all characters within a range of characters (inclusive). + + Example: + + ```python + tuple(iter_char_range("a", "c")) + # Output: ('a', 'b', 'c') + + tuple(iter_char_range("c", "a")) + # Output: ('c', 'b', 'a') + ``` Args: - first_cpoint (int): Starting (first) codepoint. - last_cpoint (int): Ending (last) codepoint. + first_char (str): Starting (first) character. + last_char (str): Ending (last) character. Yields: - str: Characters within a range of codepoints. + str: Characters within a range of characters. 
""" - for i in range(ord(first_cpoint), ord(last_cpoint) + 1): - yield chr(i) + first_ord = ord(first_char) + last_ord = ord(last_char) + if first_ord > last_ord: + ord_range = range(first_ord, last_ord - 1, -1) + else: + ord_range = range(first_ord, last_ord + 1) + for ordinal in ord_range: + yield chr(ordinal) -def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...]: - """Tuple of all characters within a range of codepoints (inclusive). +def char_range(first_char: str, last_char: str) -> tuple[str, ...]: + """Tuple of all characters within a range of characters (inclusive). + + Example: + + ```python + char_range("a", "d") + # Output: ('a', 'b', 'c', 'd') + + char_range("d", "a") + # Output: ('d', 'c', 'b', 'a') + ``` Args: - first_cpoint (int): Starting (first) codepoint. - last_cpoint (int): Ending (last) codepoint. + first_char (str): Starting (first) character. + last_char (str): Ending (last) character. Returns: - tuple[str, ...]: Characters within a range of codepoints. + tuple[str, ...]: Characters within a range of characters. 
""" - return tuple(iter_char_range(first_cpoint, last_cpoint)) + return tuple(iter_char_range(first_char, last_char)) def mask_span( diff --git a/tests/test_base.py b/tests/test_base.py index afc1259..4020eae 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,555 +1,318 @@ +# import random import re -import unittest +from collections.abc import Iterable from itertools import product +import pytest import re2 import regex_toolkit -from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE -from regex_toolkit.enums import RegexFlavor - - -class TestEscapeRE(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE - self._re_compile = re.compile - - def test_safe(self): - for char in ALWAYS_SAFE: - with self.subTest(char=char): - expected_exp = char - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - def test_escapable(self): - for char in ALWAYS_ESCAPE: - with self.subTest(char=char): - expected_exp = f"\\{char}" - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - def test_unknown(self): - # TODO: Include additional characters to test. - for char in "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…": - with self.subTest(char=char): - expected_exp = f"\\{char}" - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. 
- pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - -class TestEscapeRE2(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE2 - self._re_compile = re2.compile - - def test_safe(self): - for char in ALWAYS_SAFE: - with self.subTest(char=char): - expected_exp = char - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - def test_escapable(self): - for char in ALWAYS_ESCAPE: - with self.subTest(char=char): - expected_exp = f"\\{char}" - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - def test_unknown(self): - # TODO: Include additional characters to test. - # NOTE: Same as running: "\\x{" + format(ord("πŸŒ„"), "x").zfill(8) + "}" - for char, expected_exp in ( - # Length 1 - ("πŸ…°", r"\x{0001f170}"), - ("πŸ…±", r"\x{0001f171}"), - ("πŸ…Ύ", r"\x{0001f17e}"), - ("πŸ…Ώ", r"\x{0001f17f}"), - ("πŸ†Ž", r"\x{0001f18e}"), - ("πŸ†‘", r"\x{0001f191}"), - ("πŸ†’", r"\x{0001f192}"), - ("πŸ†“", r"\x{0001f193}"), - ("πŸ†”", r"\x{0001f194}"), - ("πŸ†•", r"\x{0001f195}"), - ("πŸ†–", r"\x{0001f196}"), - ("πŸ†—", r"\x{0001f197}"), - ("πŸ†˜", r"\x{0001f198}"), - ("πŸ†™", r"\x{0001f199}"), - ("πŸ†š", r"\x{0001f19a}"), - ("πŸ‡¦", r"\x{0001f1e6}"), - ("πŸ‡§", r"\x{0001f1e7}"), - ("πŸ‡¨", r"\x{0001f1e8}"), - ("🈁", r"\x{0001f201}"), - ("πŸˆ‚", r"\x{0001f202}"), - ("🈚", r"\x{0001f21a}"), - ("🈯", r"\x{0001f22f}"), - ("🈲", r"\x{0001f232}"), - ("🈳", r"\x{0001f233}"), - ("🈴", r"\x{0001f234}"), - ("🈡", r"\x{0001f235}"), - ("🈢", r"\x{0001f236}"), - ("🈷", r"\x{0001f237}"), - ("🈸", r"\x{0001f238}"), - ("🈹", r"\x{0001f239}"), - ("🈺", r"\x{0001f23a}"), - ("πŸ‰", 
r"\x{0001f250}"), - ("πŸ‰‘", r"\x{0001f251}"), - ("πŸŒ€", r"\x{0001f300}"), - ("🌁", r"\x{0001f301}"), - ("πŸŒ‚", r"\x{0001f302}"), - ("πŸŒƒ", r"\x{0001f303}"), - ("πŸŒ„", r"\x{0001f304}"), - # Length 2 - ("πŸŒ…", r"\x{0001f305}"), - ): - with self.subTest(char=char): - actual_exp = regex_toolkit.escape(char, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the character. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(char)) - - -class TestStringAsExpressionRE(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE - self._re_compile = re.compile - - def test_safe_individual_char(self): - # Single character. - for char in ALWAYS_SAFE: - with self.subTest(char=char): - text = char - expected_exp = char - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - # TODO: Add tests for mix of characters. - def test_safe_joined_as_one(self): - # All characters. - text = "".join(ALWAYS_SAFE) - expected_exp = text - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_escapable_individual_char(self): - # Single character. - for char in ALWAYS_ESCAPE: - with self.subTest(char=char): - text = char - expected_exp = f"\\{char}" - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_escapable_joined_as_one(self): - # All characters. 
- text = "".join(ALWAYS_ESCAPE) - expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_unsafe_joined_as_one(self): - # All characters. - text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" - expected_exp = "".join(f"\\{char}" for char in text) - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - -class TestStringAsExpressionRE2(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE2 - self._re_compile = re2.compile - - # TODO: Add tests for mix of characters. - def test_safe_individual_char(self): - # Single character. - for char in ALWAYS_SAFE: - with self.subTest(char=char): - text = char - expected_exp = char - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_safe_joined_as_one(self): - # All characters. - text = "".join(ALWAYS_SAFE) - expected_exp = "".join(ALWAYS_SAFE) - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_escapable_individual_char(self): - # Single character. 
- for char in ALWAYS_ESCAPE: - with self.subTest(char=char): - text = char - expected_exp = f"\\{char}" - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_escapable_joined_as_one(self): - # All characters. - text = "".join(ALWAYS_ESCAPE) - expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. - pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) - - def test_unknown_joined_as_one(self): - text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" - expected_exp = r"".join( - ( - r"\x{0001f170}", - r"\x{0001f171}", - r"\x{0001f17e}", - r"\x{0001f17f}", - r"\x{0001f18e}", - r"\x{0001f191}", - r"\x{0001f192}", - r"\x{0001f193}", - r"\x{0001f194}", - r"\x{0001f195}", - r"\x{0001f196}", - r"\x{0001f197}", - r"\x{0001f198}", - r"\x{0001f199}", - r"\x{0001f19a}", - r"\x{0001f1e6}", - r"\x{0001f1e7}", - r"\x{0001f1e8}", - r"\x{0001f201}", - r"\x{0001f202}", - r"\x{0001f21a}", - r"\x{0001f22f}", - r"\x{0001f232}", - r"\x{0001f233}", - r"\x{0001f234}", - r"\x{0001f235}", - r"\x{0001f236}", - r"\x{0001f237}", - r"\x{0001f238}", - r"\x{0001f239}", - r"\x{0001f23a}", - r"\x{0001f250}", - r"\x{0001f251}", - r"\x{0001f300}", - r"\x{0001f301}", - r"\x{0001f302}", - r"\x{0001f303}", - r"\x{0001f304}", - # Length 2 - r"\x{0001f305}", - ) +from regex_toolkit.constants import ( + ALWAYS_ESCAPE, + ALWAYS_SAFE, + RESERVED_EXPRESSIONS, +) +from regex_toolkit.enums import ALL_REGEX_FLAVORS, RegexFlavor + +NON_ASCII_CHARS = 
"πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + + +def _exp_will_match(exp: str, text: str, flavor: int) -> bool: + if flavor == 1: + return bool(re.fullmatch(exp, text)) + if flavor == 2: + return bool(re2.fullmatch(exp, text)) + raise ValueError(f"Invalid regex flavor: {flavor!r}") + + +def assert_exp_will_match(exp: str, text: str, flavor: int) -> bool: + assert _exp_will_match( + exp, text, flavor + ), f"RE{flavor} Pattern: {exp!r} does not match {text!r}" + + +def assert_exp_will_match_all(exp: str, texts: Iterable[str], flavor: int) -> bool: + for text in texts: + assert_exp_will_match(exp, text, flavor) + + +# RE and RE2 - Escape + + +@pytest.mark.parametrize("char, expected", [(char, char) for char in ALWAYS_SAFE]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) +def test_escape_and_escape2_safe(char, expected, flavor): + actual = regex_toolkit.escape(char, flavor) + assert actual == expected + assert_exp_will_match(actual, char, flavor) + + +@pytest.mark.parametrize( + "char, expected_exp", [(char, f"\\{char}") for char in ALWAYS_ESCAPE] +) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) +def test_escape_and_escape2_escapable(char, expected_exp, flavor): + actual = regex_toolkit.escape(char, flavor) + assert actual == expected_exp + assert_exp_will_match(actual, char, flavor) + + +# RE - Escape + + +@pytest.mark.parametrize( + "char, expected_exp", + [(char, f"\\{char}") for char in NON_ASCII_CHARS], +) +def test_escape_unknown(char, expected_exp): + actual = regex_toolkit.escape(char, RegexFlavor.RE) + assert actual == expected_exp + assert_exp_will_match(actual, char, RegexFlavor.RE) + + +# RE2 - Escape + + +@pytest.mark.parametrize( + "char, expected", + [ + (char, "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}") + for char in NON_ASCII_CHARS + ], +) +def test_escape2_unknown(char, expected): + actual = 
regex_toolkit.escape(char, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match(actual, char, RegexFlavor.RE2) + + +def test_escape2_trimmed(): + text = "Β°" + expected = "\\x{00b0}" + actual = regex_toolkit.escape(text, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE2) + + +def test_escape2_untrimmed(): + text = "πŸ…°" + expected = "\\x{0001f170}" + actual = regex_toolkit.escape(text, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE2) + + +# RE and RE2 - String as expression + + +@pytest.mark.parametrize("text, expected", [(text, text) for text in ALWAYS_SAFE]) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) +def test_string_as_exp_and_exp2_safe_individual_char(text, expected, flavor): + actual = regex_toolkit.string_as_exp(text, flavor) + assert actual == expected + assert_exp_will_match(actual, text, flavor) + + +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) +def test_string_as_exp_and_exp2_safe_joined_as_one(flavor): + text = "".join(ALWAYS_SAFE) + expected = "".join(ALWAYS_SAFE) + actual = regex_toolkit.string_as_exp(text, flavor) + assert actual == expected + assert_exp_will_match(actual, text, flavor) + + +@pytest.mark.parametrize( + "text, expected", [(char, f"\\{char}") for char in ALWAYS_ESCAPE] +) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) +def test_string_as_exp_and_exp2_escapable_individual_char(text, expected, flavor): + actual = regex_toolkit.string_as_exp(text, flavor) + assert actual == expected + assert_exp_will_match(actual, text, flavor) + + +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) +def test_string_as_exp_and_exp2_escapable_joined_as_one(flavor): + text = "".join(ALWAYS_ESCAPE) + expected = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) + actual = regex_toolkit.string_as_exp(text, flavor) + assert actual == expected + assert_exp_will_match(actual, text, flavor) + + +# RE - String as 
expression + + +@pytest.mark.parametrize( + "text, expected", + [(text, f"\\{text}") for text in NON_ASCII_CHARS], +) +def test_string_as_exp_unsafe_individual_char(text, expected): + actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE) + + +def test_string_as_exp_unsafe_joined_as_one(): + text = "".join(NON_ASCII_CHARS) + expected = "".join(f"\\{char}" for char in text) + actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE) + + +# RE2 - String as expression + + +@pytest.mark.parametrize( + "text, expected", + [ + (char, "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}") + for char in NON_ASCII_CHARS + ], +) +def test_string_as_exp2_unknown_individual_char(text, expected): + actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE2) + + +def test_string_as_exp2_unknown_joined_as_one(): + text = "".join(NON_ASCII_CHARS) + expected = "".join( + "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}" + for char in text + ) + actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match(actual, text, RegexFlavor.RE2) + + +# RE and RE2 - Strings as expression + + +@pytest.mark.parametrize( + "texts, expected", + [(texts, r"|".join(texts)) for texts in product(ALWAYS_SAFE, repeat=2)], +) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) +def test_strings_as_exp_and_exp2_safe_of_various_lengths(texts, expected, flavor): + actual = regex_toolkit.strings_as_exp(texts, flavor) + assert actual == expected + assert_exp_will_match_all(actual, texts, flavor) + + +@pytest.mark.parametrize( + "texts, expected", + [ + (texts, r"|".join(f"\\{text}" for text in texts)) + for texts in product(ALWAYS_ESCAPE, repeat=2) + ], +) 
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) +def test_strings_as_exp_and_exp2_escapable_of_various_lengths(texts, expected, flavor): + actual = regex_toolkit.strings_as_exp(texts, flavor) + assert actual == expected + assert_exp_will_match_all(actual, texts, flavor) + + +@pytest.mark.parametrize( + "texts, expected", + [ + (texts, r"|".join(f"\\{text}" for text in texts)) + for texts in product(RESERVED_EXPRESSIONS, repeat=2) + ], +) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) +def test_strings_as_exp_and_exp2_reserved_of_various_lengths(texts, expected, flavor): + actual = regex_toolkit.strings_as_exp(texts, flavor) + assert actual == expected + assert_exp_will_match_all(actual, texts, flavor) + + +@pytest.mark.parametrize( + "texts, expected", + [ + ( + texts, + r"|".join(text if text in ALWAYS_SAFE else f"\\{text}" for text in texts), ) - actual_exp = regex_toolkit.string_as_exp(text, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches the string. 
- pattern = self._re_compile(actual_exp) - self.assertTrue(pattern.match(text)) + for texts in product(ALWAYS_SAFE | ALWAYS_ESCAPE, repeat=2) + ], +) +@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS) +def test_strings_as_exp_and_exp2_safe_and_escapable_of_various_lengths( + texts, expected, flavor +): + actual = regex_toolkit.strings_as_exp(texts, flavor) + assert actual == expected + assert_exp_will_match_all(actual, texts, flavor) + +# RE - Strings as expression -RESERVED_EXPRESSIONS = frozenset( - {r"\A", r"\b", r"\B", r"\d", r"\D", r"\s", r"\S", r"\w", r"\W", r"\Z", r"\1"} + +@pytest.mark.parametrize( + "texts, expected", + [ + (texts, r"|".join(f"\\{text}" for text in texts)) + for texts in product(NON_ASCII_CHARS, repeat=2) + ], ) +def test_strings_as_exp_unsafe_of_various_lengths(texts, expected): + actual = regex_toolkit.strings_as_exp(texts, RegexFlavor.RE) + assert actual == expected + assert_exp_will_match_all(actual, texts, RegexFlavor.RE) + +# RE2 - Strings as expression -class StringsAsExpressionRE(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE - self._re_compile = re.compile - self._max_combo_length = 2 - - def test_safe_of_various_lengths(self): - # Unique combinations of `ALWAYS_SAFE` using various lengths. - elements = ALWAYS_SAFE - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_escapable_of_various_lengths(self): - # Unique combinations of `ALWAYS_ESCAPE` using various lengths. 
- elements = ALWAYS_ESCAPE - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(f"\\{text}" for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_reserved_of_various_lengths(self): - # Unique combinations of reserved expressions using various lengths. - # Exact matches that equate to reserved spaces - # E.g. Should match '\\' + 'n', not r'\n' - elements = RESERVED_EXPRESSIONS - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(f"\\{text}" for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_unsafe_of_various_lengths(self): - # TODO: Include text/chars such as punctuation, etc. - # Unique combinations of `ALWAYS_SAFE` using various lengths. - elements = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(f"\\{text}" for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. 
- pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_safe_and_escapable_of_various_lengths(self): - # Unique combinations of `ALWAYS_SAFE` and `ALWAYS_ESCAPE` using various lengths. - elements = ALWAYS_SAFE | ALWAYS_ESCAPE - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join( - text if text in ALWAYS_SAFE else f"\\{text}" for text in texts - ) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - # def test_actual_examples(self): - # - - -###################### -###################### # Multiple unsafe char -###################### self.assertEqual( -###################### regex_toolkit.strings_as_exp([".", "!", "?"], self._flavor), -###################### "\\.|\\!|\\?", -###################### ) -###################### -###################### for texts, expected_exp in [ -###################### (["πŸ…°"], "\\πŸ…°"), -###################### (["πŸ…°", "πŸ…±"], "\\πŸ…°|\\πŸ…±"), -###################### (["alpha", "beta"], "alpha|beta"), -###################### (["πŸ…°lpha", "πŸ…±eta"], "\\πŸ…°lpha|\\πŸ…±eta"), -###################### (["πŸ…°lpha", "Beta"], "\\πŸ…°lpha|Beta"), -###################### ]: -###################### self.assertEqual( -###################### regex_toolkit.strings_as_exp(texts, self._flavor), -###################### expected_exp, -###################### ) - - -class StringsAsExpressionRE2(unittest.TestCase): - def setUp(self): - self._flavor = RegexFlavor.RE2 - self._re_compile = re2.compile - self._max_combo_length = 2 - - def test_safe_of_variable_lengths(self): - # Unique combinations of 
ALWAYS_SAFE using various lengths. - elements = set(ALWAYS_SAFE) - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_escapable_of_variable_lengths(self): - # Unique combinations of ALWAYS_ESCAPE using various lengths. - elements = ALWAYS_ESCAPE - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(f"\\{text}" for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_reserved_of_variable_lengths(self): - # Unique combinations of reserved expressions using various lengths. - # Exact matches that equate to reserved spaces - # E.g. Should match '\\' + 'n', not r'\n' - elements = RESERVED_EXPRESSIONS - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(f"\\{text}" for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - def test_unsafe_of_variable_lengths(self): - # TODO: Include text/chars such as punctuation, etc. 
- # Unique combinations of ALWAYS_SAFE using various lengths. - elements_map = { - # Length 1 - "πŸ…°": r"\x{0001f170}", - "πŸ…±": r"\x{0001f171}", - "πŸ…Ύ": r"\x{0001f17e}", - "πŸ…Ώ": r"\x{0001f17f}", - "πŸ†Ž": r"\x{0001f18e}", - "πŸ†‘": r"\x{0001f191}", - "πŸ†’": r"\x{0001f192}", - "πŸ†“": r"\x{0001f193}", - "πŸ†”": r"\x{0001f194}", - "πŸ†•": r"\x{0001f195}", - "πŸ†–": r"\x{0001f196}", - "πŸ†—": r"\x{0001f197}", - "πŸ†˜": r"\x{0001f198}", - "πŸ†™": r"\x{0001f199}", - "πŸ†š": r"\x{0001f19a}", - "πŸ‡¦": r"\x{0001f1e6}", - "πŸ‡§": r"\x{0001f1e7}", - "πŸ‡¨": r"\x{0001f1e8}", - "🈁": r"\x{0001f201}", - "πŸˆ‚": r"\x{0001f202}", - "🈚": r"\x{0001f21a}", - "🈯": r"\x{0001f22f}", - "🈲": r"\x{0001f232}", - "🈳": r"\x{0001f233}", - "🈴": r"\x{0001f234}", - "🈡": r"\x{0001f235}", - "🈢": r"\x{0001f236}", - "🈷": r"\x{0001f237}", - "🈸": r"\x{0001f238}", - "🈹": r"\x{0001f239}", - "🈺": r"\x{0001f23a}", - "πŸ‰": r"\x{0001f250}", - "πŸ‰‘": r"\x{0001f251}", - "πŸŒ€": r"\x{0001f300}", - "🌁": r"\x{0001f301}", - "πŸŒ‚": r"\x{0001f302}", - "πŸŒƒ": r"\x{0001f303}", - "πŸŒ„": r"\x{0001f304}", - # Length 2 - "πŸŒ…": r"\x{0001f305}", - } - elements = tuple(elements_map) - for texts in product(elements, repeat=self._max_combo_length): - with self.subTest(texts=texts): - expected_exp = r"|".join(elements_map[text] for text in texts) - actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) - self.assertEqual(actual_exp, expected_exp) - # Ensure the expression compiles and matches each of the strings. - pattern = self._re_compile(actual_exp) - for text in texts: - with self.subTest("match pattern", text=text): - self.assertTrue(pattern.match(text)) - - -##############################3 # Exact matches that equate to reserved spaces -##############################3 # E.g. 
Should match '\\' + 'n', not r'\n' -##############################3 for text in (r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"): -##############################3 texts = [text] -##############################3 with self.subTest(texts=texts): -##############################3 self.assertEqual( -##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), -##############################3 f"\\{text}", -##############################3 ) -##############################3 -##############################3 # Single whitespace char -##############################3 for texts in (["\n"], ["\v"], ["\t"], ["\r"], ["\f"], ["\v"]): -##############################3 with self.subTest(texts=texts): -##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, self._flavor), texts[0]) -##############################3 -##############################3 # Single unsafe char -##############################3 for texts, expected_exp in [ -##############################3 (["."], "\\."), -##############################3 (["!"], "\\!"), -##############################3 (["?"], "\\?"), -##############################3 ]: -##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): -##############################3 self.assertEqual( -##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), -##############################3 expected_exp, -##############################3 ) -##############################3 -##############################3 # Multiple unsafe char -##############################3 texts = [".", "!", "?"] -##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, self._flavor), "\\.|\\!|\\?") -##############################3 -##############################3 for texts, expected_exp in [ -##############################3 (["πŸ…°"], "\\x{0001f170}"), -##############################3 (["πŸ…°", "πŸ…±"], "\\x{0001f170}|\\x{0001f171}"), -##############################3 
(["alpha", "beta"], "alpha|beta"), -##############################3 (["πŸ…°lpha", "πŸ…±eta"], "\\x{0001f170}lpha|\\x{0001f171}eta"), -##############################3 (["πŸ…°lpha", "Beta"], "\\x{0001f170}lpha|Beta"), -##############################3 ]: -##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): -##############################3 self.assertEqual( -##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), -##############################3 expected_exp, -##############################3 ) - -# TODO: Add tests for actually compiling the e. + +@pytest.mark.parametrize( + "texts, expected", + [ + ( + texts, + r"|".join( + "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}" + for char in texts + ), + ) + for texts in product(*NON_ASCII_CHARS, repeat=2) + ], +) +def test_strings_as_exp2_unsafe_of_various_lengths(texts, expected): + actual = regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2) + assert actual == expected + assert_exp_will_match_all(actual, texts, RegexFlavor.RE2) + + +# Make expression + + +@pytest.mark.parametrize( + "chars, expected", + ( + # 1 char does not make a range + (["a"], "a"), + # 2 chars should not make a range + (["a", "b"], "ab"), + # 3+ sequential chars make a range + (["a", "b", "c"], "a-c"), + # 3+ non-sequential chars should not make a range + (["a", "c", "e"], "ace"), + # 3+ sequential chars with extra out of range char + (["a", "b", "c", "z"], "a-cz"), + # Chars should always be ordered by ordinal + (["b", "a"], "ab"), + # Chars should always be ordered by ordinal + (["e", "c", "a"], "ace"), + # Chars should always be ordered by ordinal + (["z", "c", "b", "a"], "a-cz"), + # Duplicates should be removed + (["d", "a", "b", "c", "a"], "a-d"), + ), +) +def test_make_exp(chars, expected): + assert regex_toolkit.make_exp(chars, RegexFlavor.RE) == expected diff --git a/tests/test_enums.py b/tests/test_enums.py index 3af0dbd..d061d36 100644 --- 
a/tests/test_enums.py +++ b/tests/test_enums.py @@ -3,18 +3,23 @@ from regex_toolkit.enums import RegexFlavor -def test_regex_flavor_enum_is_int(): - assert isinstance(RegexFlavor.RE, int) - assert RegexFlavor.RE == 1 - assert RegexFlavor(1) == RegexFlavor.RE - assert isinstance(RegexFlavor.RE2, int) - assert RegexFlavor.RE2 == 2 - assert RegexFlavor(2) == RegexFlavor.RE2 +def test_regex_flavor_enum_has_expected_members(): + assert len(RegexFlavor) == 2 + assert len(set(RegexFlavor)) == len(RegexFlavor) + assert RegexFlavor.RE.name == "RE" + assert RegexFlavor.RE.value == RegexFlavor.RE == RegexFlavor(1) == 1 + assert RegexFlavor(1) is RegexFlavor.RE -def test_invalid_regex_flavor_raises_value_error(): - with pytest.raises(ValueError): - RegexFlavor(0) + assert RegexFlavor.RE2.name == "RE2" + assert RegexFlavor.RE2 == RegexFlavor.RE2.value == RegexFlavor(2) == 2 + assert RegexFlavor(2) is RegexFlavor.RE2 - with pytest.raises(ValueError): - RegexFlavor(3) + +@pytest.mark.parametrize("invalid_flavor", (0, 3)) +def test_invalid_regex_flavor_raises_value_error(invalid_flavor): + with pytest.raises( + ValueError, + match=f"^{invalid_flavor} is not a valid RegexFlavor$", + ): + RegexFlavor(invalid_flavor) diff --git a/tests/test_utils.py b/tests/test_utils.py index c1fccdf..5c27b37 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,51 @@ import unittest from collections.abc import Generator, Iterable +from unittest import mock + +import pytest import regex_toolkit +from regex_toolkit.enums import RegexFlavor + + +@pytest.mark.parametrize( + "potential_flavor, expected", + [ + (1, RegexFlavor.RE), + (2, RegexFlavor.RE2), + (RegexFlavor.RE, RegexFlavor.RE), + (RegexFlavor.RE2, RegexFlavor.RE2), + (RegexFlavor(1), RegexFlavor.RE), + (RegexFlavor(2), RegexFlavor.RE2), + ], +) +def test_resolve_flavor_with_valid(potential_flavor, expected): + assert regex_toolkit.base.resolve_flavor(potential_flavor) == expected + + 
+@mock.patch("regex_toolkit.base.default_flavor", None) +def test_resolve_flavor_with_invalid_and_with_no_default_raises_value_error(): + with pytest.raises(ValueError, match=r"^Invalid regex flavor: None$"): + regex_toolkit.base.resolve_flavor(None) + + +@pytest.mark.parametrize("potential_flavor", [None, 0, 3, "1", "2"]) +@mock.patch("regex_toolkit.base.default_flavor", RegexFlavor.RE) +def test_resolve_flavor_falls_back_to_default(potential_flavor): + regex_toolkit.base.resolve_flavor(potential_flavor) == RegexFlavor.RE + + +@pytest.mark.parametrize("potential_flavor", [None, 0, 3, "1", "2"]) +@mock.patch("regex_toolkit.base.default_flavor", None) +def test_resolve_flavor_invalid_int_without_default_raises(potential_flavor): + with pytest.raises(ValueError, match=r"^Invalid regex flavor: (None|'?\d'?)$"): + regex_toolkit.base.resolve_flavor(potential_flavor) + + +@mock.patch("regex_toolkit.base.default_flavor", None) +def test_default_flavor_can_be_set(): + regex_toolkit.base.default_flavor = 2 + assert regex_toolkit.base.resolve_flavor(None) == RegexFlavor.RE2 def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool: @@ -19,81 +63,118 @@ def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool: return True -class TestSortByLength(unittest.TestCase): - def setUp(self) -> None: - self.texts = { - "apple", - "orange", - "banana", - "grape", - "apricot", - "cherry", - "plum", - "blueberry", - "strawberry", - "blackberry", - } - self.texts_by_type = ( - (set, self.texts), - (frozenset, frozenset(self.texts)), - (tuple, tuple(self.texts)), - (list, list(self.texts)), - (dict, dict.fromkeys(self.texts, None)), - ) - - def test_iter_sort_by_len(self): - for try_type, typed_texts in self.texts_by_type: - for reverse in (False, True): - with self.subTest( - try_type=try_type, - typed_texts=typed_texts, - reverse=reverse, - ): - result = regex_toolkit.iter_sort_by_len( - typed_texts, - reverse=reverse, - ) - self.assertIsInstance(result, 
Generator) - result_tuple = tuple(result) - self.assertTrue(is_sorted_by_len(result_tuple, reverse=reverse)) - self.assertEqual( - result_tuple, - tuple(sorted(typed_texts, key=len, reverse=reverse)), - ) - - def test_sort_by_len(self): - for try_type, typed_texts in self.texts_by_type: - for reverse in (False, True): - with self.subTest( - try_type=try_type, - typed_texts=typed_texts, - reverse=reverse, - ): - result = regex_toolkit.sort_by_len(typed_texts, reverse=reverse) - self.assertIsInstance(result, tuple) - self.assertTrue(is_sorted_by_len(result, reverse=reverse)) - self.assertEqual( - result, - tuple(sorted(typed_texts, key=len, reverse=reverse)), - ) - - -class TestIterCharRange(unittest.TestCase): - def test_iter_char_range(self): - result = regex_toolkit.iter_char_range("a", "z") - self.assertIsInstance(result, Generator) - self.assertTupleEqual( - tuple(result), - tuple("abcdefghijklmnopqrstuvwxyz"), - ) - - def test_char_range(self): - result = regex_toolkit.char_range("a", "z") - self.assertIsInstance(result, tuple) - self.assertTupleEqual( - result, - tuple("abcdefghijklmnopqrstuvwxyz"), - ) +SORT_BY_LEN_TEXTS = [ + "apple", + "orange", + "banana", + "grape", + "apricot", + "cherry", + "plum", + "blueberry", + "strawberry", + "blackberry", +] +SORT_BY_LEN_TEXTS_BY_TYPE = { + set: set(SORT_BY_LEN_TEXTS), + frozenset: frozenset(SORT_BY_LEN_TEXTS), + tuple: tuple(SORT_BY_LEN_TEXTS), + list: list(SORT_BY_LEN_TEXTS), + dict: dict.fromkeys(SORT_BY_LEN_TEXTS, None), +} + + +@pytest.mark.parametrize("try_type, typed_texts", SORT_BY_LEN_TEXTS_BY_TYPE.items()) +@pytest.mark.parametrize("reverse", (False, True)) +def test_iter_sort_by_len(try_type, typed_texts, reverse): + expected_tuple = tuple(sorted(typed_texts, key=len, reverse=reverse)) + assert is_sorted_by_len(expected_tuple, reverse=reverse) + + actual = regex_toolkit.iter_sort_by_len(typed_texts, reverse=reverse) + actual_tuple = tuple(actual) + assert isinstance(actual, Generator) and (actual_tuple 
== expected_tuple), { + "try_type": try_type, + "typed_texts": typed_texts, + "reverse": reverse, + "actual_tuple": actual_tuple, + "expected_tuple": expected_tuple, + } + + +@pytest.mark.parametrize("try_type, typed_texts", SORT_BY_LEN_TEXTS_BY_TYPE.items()) +@pytest.mark.parametrize("reverse", (False, True)) +def test_sort_by_len(try_type, typed_texts, reverse): + expected = tuple(sorted(typed_texts, key=len, reverse=reverse)) + assert is_sorted_by_len(expected, reverse=reverse) + + actual = regex_toolkit.sort_by_len(typed_texts, reverse=reverse) + assert isinstance(actual, tuple) and (actual == expected), { + "try_type": try_type, + "typed_texts": typed_texts, + "reverse": reverse, + "actual": actual, + "expected": expected, + } + + +ITER_CHAR_RANGE_CASES = [ + # Single char + (("a", "a"), ("a",)), + # Basic range + (("a", "d"), ("a", "b", "c", "d")), + # Reverse range + (("d", "a"), ("d", "c", "b", "a")), + # Single char (non-ASCII) + (("🐢", "🐺"), ("🐢", "🐷", "🐸", "🐹", "🐺")), +] + + +@pytest.mark.parametrize("char_range, expected", ITER_CHAR_RANGE_CASES) +def test_char_range(char_range, expected): + actual = regex_toolkit.char_range(*char_range) + assert isinstance(actual, tuple) + assert actual == expected, { + "char_range": char_range, + "actual": actual, + "expected": expected, + } + + +@pytest.mark.parametrize("char_range, expected", ITER_CHAR_RANGE_CASES) +def test_iter_char_range(char_range, expected): + actual = regex_toolkit.iter_char_range(*char_range) + assert isinstance(actual, Generator) + actual_tuple = tuple(actual) + assert actual_tuple == expected, { + "char_range": char_range, + "actual_tuple": actual_tuple, + "expected": expected, + } + + +@pytest.mark.parametrize( + "text, expected", + ( + # Empty string + ("", ""), + # Already NFC + ("a", "a"), + # Already NFC (non-ASCII) + ("🐢🐾", "🐢🐾"), + # Basic combining char (acute accent) + ("a\u0301", "Γ‘"), + # Multiple combining chars (diaeresis and acute accent) + ("o\u0308\u0301", "ấ"), + ), +) +def 
test_to_nfc(text, expected): + actual = regex_toolkit.to_nfc(text) + assert isinstance(actual, str) + assert actual == expected, { + "text": text, + "actual": actual, + "expected": expected, + } class TestMasking(unittest.TestCase):