diff --git a/.gitignore b/.gitignore
index 5013d26..b04526a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@
!environment.yml
!codecov.yml
!requirements-doc.txt
+!requirements-test.txt
!src/
!src/*
diff --git a/Makefile b/Makefile
index 8392872..9115bef 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
PYTHON=python3
-APP_NAME=regex-toolkit
+APP_NAME=regex_toolkit
install:
${PYTHON} -m pip install .
diff --git a/README.md b/README.md
index 1be1bae..74375b0 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Most stable version from [**PyPi**](https://pypi.org/project/regex-toolkit/):
[](https://pypi.org/project/regex-toolkit/)
```bash
-$ python3 -m pip install regex-toolkit
+python3 -m pip install regex-toolkit
```
Development version from [**GitHub**](https://github.com/Phosmic/regex-toolkit):
@@ -48,26 +48,49 @@ Development version from [**GitHub**](https://github.com/Phosmic/regex-toolkit):
```bash
-$ git clone git+https://github.com/Phosmic/regex-toolkit.git
-$ cd regex-toolkit
-$ python3 -m pip install -e .
+git clone https://github.com/Phosmic/regex-toolkit.git
+cd regex-toolkit
+python3 -m pip install -e .
```
---
## Usage
-Import packages:
+To harness the toolkit's capabilities, you should import the necessary packages:
```python
import re
# and/or
import re2
+import regex_toolkit as rtk
```
-```python
-import regex_toolkit
-```
+### Why Use `regex_toolkit`?
+
+Regex definitions vary across languages and versions.
+By using the toolkit, you can achieve a more consistent and comprehensive representation of Unicode support.
+It is especially useful to supplement base Unicode sets with the latest definitions from other languages and standards.
+
+### RE2 Overview
+
+RE2 focuses on safely processing regular expressions, particularly from untrusted inputs.
+It ensures both linear match time and efficient memory usage.
+Although it might not always surpass other engines in speed, it intentionally omits features that depend solely on backtracking, like backreferences and look-around assertions.
+
+A brief rundown of RE2 terminology:
+
+- **BitState**: An execution engine that uses backtracking search.
+- **bytecode**: The set of instructions that form an automaton.
+- **DFA**: The engine for Deterministic Finite Automaton searches.
+- **NFA**: Implements the Nondeterministic Finite Automaton search method.
+- **OnePass**: A one-pass search execution engine.
+- **pattern**: The textual form of a regex.
+- **Prog**: The compiled version of a regex.
+- **Regexp**: The parsed version of a regex.
+- **Rune**: A character in terms of encoding, essentially a code point.
+
+For an in-depth exploration, please refer to the [RE2 documentation](https://github.com/google/re2/wiki/Glossary).
---
@@ -77,6 +100,39 @@ import regex_toolkit
# `regex_toolkit.utils`
+
+
+#### `resolve_flavor`
+
+```python
+def resolve_flavor(potential_flavor: int | RegexFlavor | None) -> RegexFlavor
+```
+
+Resolve a regex flavor.
+
+If the flavor is an integer, it is validated and returned.
+If the flavor is a RegexFlavor, it is returned.
+If the flavor is None, the default flavor is returned. To change the default flavor, set `regex_toolkit.base.default_flavor`.
+
+```python
+import regex_toolkit as rtk
+
+rtk.base.default_flavor = 2
+assert rtk.utils.resolve_flavor(None) == rtk.enums.RegexFlavor.RE2
+```
+
+**Arguments**:
+
+- `potential_flavor` _int | RegexFlavor | None_ - Potential regex flavor.
+
+**Returns**:
+
+- _RegexFlavor_ - Resolved regex flavor.
+
+**Raises**:
+
+- `ValueError` - Invalid regex flavor.
+
#### `iter_sort_by_len`
@@ -134,8 +190,8 @@ The codepoint is always 8 characters long (zero-padded).
**Example**:
```python
-# Output: '00000061'
ord_to_cpoint(97)
+# Output: '00000061'
```
**Arguments**:
@@ -177,8 +233,8 @@ Character to character codepoint.
**Example**:
```python
-# Output: '00000061'
char_to_cpoint("a")
+# Output: '00000061'
```
**Arguments**:
@@ -201,6 +257,13 @@ Normalize a Unicode string to NFC form C.
Form C favors the use of a fully combined character.
+**Example**:
+
+```python
+to_nfc("e\u0301") == "é"
+# Output: True
+```
+
**Arguments**:
- `text` _str_ - String to normalize.
@@ -214,39 +277,59 @@ Form C favors the use of a fully combined character.
#### `iter_char_range`
```python
-def iter_char_range(first_cpoint: int,
- last_cpoint: int) -> Generator[str, None, None]
+def iter_char_range(first_char: str,
+ last_char: str) -> Generator[str, None, None]
```
-Iterate all characters within a range of codepoints (inclusive).
+Iterate all characters within a range of characters (inclusive).
+
+**Example**:
+
+```python
+tuple(iter_char_range("a", "c"))
+# Output: ('a', 'b', 'c')
+
+tuple(iter_char_range("c", "a"))
+# Output: ('c', 'b', 'a')
+```
**Arguments**:
-- `first_cpoint` _int_ - Starting (first) codepoint.
-- `last_cpoint` _int_ - Ending (last) codepoint.
+- `first_char` _str_ - Starting (first) character.
+- `last_char` _str_ - Ending (last) character.
**Yields**:
-- _str_ - Characters within a range of codepoints.
+- _str_ - Characters within a range of characters.
#### `char_range`
```python
-def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...]
+def char_range(first_char: str, last_char: str) -> tuple[str, ...]
```
-Tuple of all characters within a range of codepoints (inclusive).
+Tuple of all characters within a range of characters (inclusive).
+
+**Example**:
+
+```python
+char_range("a", "d")
+# Output: ('a', 'b', 'c', 'd')
+
+char_range("d", "a")
+# Output: ('d', 'c', 'b', 'a')
+```
**Arguments**:
-- `first_cpoint` _int_ - Starting (first) codepoint.
-- `last_cpoint` _int_ - Ending (last) codepoint.
+- `first_char` _str_ - Starting (first) character.
+- `last_char` _str_ - Ending (last) character.
**Returns**:
-- _tuple[str, ...]_ - Characters within a range of codepoints.
+- _tuple[str, ...]_ - Characters within a range of characters.
@@ -303,7 +386,7 @@ Todo: Add support for overlapping (and unordered?) spans.
#### `escape`
```python
-def escape(char: str, flavor: int = 1) -> str
+def escape(char: str, flavor: int | None = None) -> str
```
Create a regex expression that exactly matches a character.
@@ -311,7 +394,7 @@ Create a regex expression that exactly matches a character.
**Arguments**:
- `char` _str_ - Character to match.
-- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1.
+- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None.
**Returns**:
@@ -326,7 +409,7 @@ Create a regex expression that exactly matches a character.
#### `string_as_exp`
```python
-def string_as_exp(text: str, flavor: int = 1) -> str
+def string_as_exp(text: str, flavor: int | None = None) -> str
```
Create a regex expression that exactly matches a string.
@@ -334,7 +417,7 @@ Create a regex expression that exactly matches a string.
**Arguments**:
- `text` _str_ - String to match.
-- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1.
+- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None.
**Returns**:
@@ -349,7 +432,7 @@ Create a regex expression that exactly matches a string.
#### `strings_as_exp`
```python
-def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str
+def strings_as_exp(texts: Iterable[str], flavor: int | None = None) -> str
```
Create a regex expression that exactly matches any one string.
@@ -357,7 +440,7 @@ Create a regex expression that exactly matches any one string.
**Arguments**:
- `texts` _Iterable[str]_ - Strings to match.
-- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1.
+- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None.
**Returns**:
@@ -367,6 +450,39 @@ Create a regex expression that exactly matches any one string.
- `ValueError` - Invalid regex flavor.
+
+
+#### `make_exp`
+
+```python
+def make_exp(chars: Iterable[str], flavor: int | None = None) -> str
+```
+
+Create a regex expression that exactly matches a list of characters.
+
+The characters are sorted and grouped into ranges where possible.
+The expression is not anchored, so it can be used as part of a larger expression.
+
+**Example**:
+
+```python
+exp = "[" + make_exp(["a", "b", "c", "z", "y", "x"]) + "]"
+# Output: '[a-cx-z]'
+```
+
+**Arguments**:
+
+- `chars` _Iterable[str]_ - Characters to match.
+- `flavor` _int | None, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to None.
+
+**Returns**:
+
+- _str_ - Expression that exactly matches the original characters.
+
+**Raises**:
+
+- `ValueError` - Invalid regex flavor.
+
# `regex_toolkit.enums`
diff --git a/ci/deps/actions-310.yml b/ci/deps/actions-310.yml
index a3a6672..8bd2b2a 100644
--- a/ci/deps/actions-310.yml
+++ b/ci/deps/actions-310.yml
@@ -8,8 +8,5 @@ dependencies:
- pytest>=7.0.0
- pytest-cov
- pytest-xdist>=2.2.0
- # - pytest-asyncio>=0.17
-
- # Required dependencies
- pip:
- google-re2>=1.0
diff --git a/ci/deps/actions-311.yml b/ci/deps/actions-311.yml
index 4a16510..b816df2 100644
--- a/ci/deps/actions-311.yml
+++ b/ci/deps/actions-311.yml
@@ -8,8 +8,5 @@ dependencies:
- pytest>=7.0.0
- pytest-cov
- pytest-xdist>=2.2.0
- # - pytest-asyncio>=0.17
-
- # Required dependencies
- pip:
- google-re2>=1.0
diff --git a/docs/templates/install.md.jinja b/docs/templates/install.md.jinja
index dcc34d3..7a86e51 100644
--- a/docs/templates/install.md.jinja
+++ b/docs/templates/install.md.jinja
@@ -5,7 +5,7 @@ Most stable version from [**PyPi**](https://pypi.org/project/{{ pypi.name }}/):
[](https://pypi.org/project/{{ pypi.name }}/)
```bash
-$ python3 -m pip install {{ pypi.name }}
+python3 -m pip install {{ pypi.name }}
```
Development version from [**GitHub**](https://github.com/{{ repo.owner }}/{{ repo.name }}):
@@ -21,7 +21,7 @@ Development version from [**GitHub**](https://github.com/{{ repo.owner }}/{{ rep
{% endif %}
```bash
-$ git clone git+https://github.com/{{ repo.owner }}/{{ repo.name }}.git
-$ cd {{ repo.name }}
-$ python3 -m pip install -e .
+git clone https://github.com/{{ repo.owner }}/{{ repo.name }}.git
+cd {{ repo.name }}
+python3 -m pip install -e .
```
\ No newline at end of file
diff --git a/docs/templates/usage.md.jinja b/docs/templates/usage.md.jinja
index 393c0d8..e7166fe 100644
--- a/docs/templates/usage.md.jinja
+++ b/docs/templates/usage.md.jinja
@@ -1,11 +1,34 @@
-Import packages:
+To harness the toolkit's capabilities, you should import the necessary packages:
```python
import re
# and/or
import re2
+import regex_toolkit as rtk
```
-```python
-import regex_toolkit
-```
\ No newline at end of file
+### Why Use `regex_toolkit`?
+
+Regex definitions vary across languages and versions.
+By using the toolkit, you can achieve a more consistent and comprehensive representation of Unicode support.
+It is especially useful to supplement base Unicode sets with the latest definitions from other languages and standards.
+
+### RE2 Overview
+
+RE2 focuses on safely processing regular expressions, particularly from untrusted inputs.
+It ensures both linear match time and efficient memory usage.
+Although it might not always surpass other engines in speed, it intentionally omits features that depend solely on backtracking, like backreferences and look-around assertions.
+
+A brief rundown of RE2 terminology:
+
+- **BitState**: An execution engine that uses backtracking search.
+- **bytecode**: The set of instructions that form an automaton.
+- **DFA**: The engine for Deterministic Finite Automaton searches.
+- **NFA**: Implements the Nondeterministic Finite Automaton search method.
+- **OnePass**: A one-pass search execution engine.
+- **pattern**: The textual form of a regex.
+- **Prog**: The compiled version of a regex.
+- **Regexp**: The parsed version of a regex.
+- **Rune**: A character in terms of encoding, essentially a code point.
+
+For an in-depth exploration, please refer to the [RE2 documentation](https://github.com/google/re2/wiki/Glossary).
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index 367bdfc..09760df 100644
--- a/environment.yml
+++ b/environment.yml
@@ -10,10 +10,7 @@ dependencies:
- pytest>=7.0.0
- pytest-cov
- pytest-xdist>=2.2.0
- # - pytest-asyncio>=0.17
- coverage
-
- # Required dependencies
- pip:
- google-re2>=1.0
diff --git a/pyproject.toml b/pyproject.toml
index fe46401..3806815 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
"Topic :: Software Development :: Libraries",
"Topic :: Software Development :: Libraries :: Python Modules",
]
-dependencies = ["google-re2>=1.0"]
+dependencies = []
dynamic = ["version"]
[project.urls]
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 0000000..bed0b62
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,4 @@
+pytest>=7.0.0
+pytest-cov
+pytest-xdist>=2.2.0
+google-re2>=1.0
diff --git a/src/regex_toolkit/__init__.py b/src/regex_toolkit/__init__.py
index de90ba6..7303ef7 100644
--- a/src/regex_toolkit/__init__.py
+++ b/src/regex_toolkit/__init__.py
@@ -1,8 +1,10 @@
from .base import (
escape,
+ make_exp,
string_as_exp,
strings_as_exp,
)
+from .enums import RegexFlavor
from .utils import (
char_range,
char_to_cpoint,
@@ -17,18 +19,20 @@
to_utf8,
)
-__version__ = "0.0.4"
+__version__ = "0.0.5"
__all__ = [
- "escape",
"char_range",
"char_to_cpoint",
"cpoint_to_ord",
+ "escape",
"iter_char_range",
"iter_sort_by_len",
+ "make_exp",
"mask_span",
"mask_spans",
"ord_to_cpoint",
+ "RegexFlavor",
"sort_by_len",
"string_as_exp",
"strings_as_exp",
diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py
index ca6a983..d59aea2 100644
--- a/src/regex_toolkit/base.py
+++ b/src/regex_toolkit/base.py
@@ -1,38 +1,21 @@
__all__ = [
+ "default_flavor",
"escape",
"string_as_exp",
"strings_as_exp",
+ "make_exp",
]
from collections.abc import Iterable
from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE
from regex_toolkit.enums import RegexFlavor
-from regex_toolkit.utils import char_to_cpoint, iter_sort_by_len
+from regex_toolkit.utils import (
+ char_to_cpoint,
+ iter_sort_by_len,
+ resolve_flavor,
+)
-
-def escape(char: str, flavor: int = 1) -> str:
- """Create a regex expression that exactly matches a character.
-
- Args:
- char (str): Character to match.
- flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1.
-
- Returns:
- str: Expression that exactly matches the original character.
-
- Raises:
- ValueError: Invalid regex flavor.
- """
- try:
- flavor = RegexFlavor(flavor)
- except ValueError:
- raise ValueError(f"Invalid regex flavor: {flavor}")
-
- if flavor == RegexFlavor.RE:
- return _escape(char)
- # elif flavor == RegexFlavor.RE2:
- else:
- return _escape2(char)
+default_flavor: int | RegexFlavor | None = RegexFlavor.RE
def _escape(char: str) -> str:
@@ -53,32 +36,25 @@ def _escape2(char: str) -> str:
return f"\\{char}"
else:
# Otherwise escape using the codepoint
- return "\\x{" + char_to_cpoint(char) + "}"
+ return "\\x{" + char_to_cpoint(char).removeprefix("0000") + "}"
-def string_as_exp(text: str, flavor: int = 1) -> str:
- """Create a regex expression that exactly matches a string.
+def escape(char: str, flavor: int | None = None) -> str:
+ """Create a regex expression that exactly matches a character.
Args:
- text (str): String to match.
- flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1.
+ char (str): Character to match.
+ flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None.
Returns:
- str: Expression that exactly matches the original string.
+ str: Expression that exactly matches the original character.
Raises:
ValueError: Invalid regex flavor.
"""
- try:
- flavor = RegexFlavor(flavor)
- except ValueError:
- raise ValueError(f"Invalid regex flavor: {flavor}")
-
- if flavor == RegexFlavor.RE:
- return _string_as_exp(text)
- # elif flavor == RegexFlavor.RE2:
- else:
- return _string_as_exp2(text)
+ if (flavor := resolve_flavor(flavor)) == RegexFlavor.RE:
+ return _escape(char)
+ return _escape2(char)
def _string_as_exp(text: str) -> str:
@@ -89,12 +65,38 @@ def _string_as_exp2(text: str) -> str:
return r"".join(map(_escape2, text))
-def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str:
+def string_as_exp(text: str, flavor: int | None = None) -> str:
+    """Build a regex expression that matches the given string exactly.
+
+    Args:
+        text (str): String to match.
+        flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None.
+
+    Returns:
+        str: Expression that exactly matches the original string.
+
+    Raises:
+        ValueError: Invalid regex flavor.
+    """
+    resolved = resolve_flavor(flavor)
+    # Anything other than the RE flavor is handled by the RE2 builder
+    builder = _string_as_exp if resolved == RegexFlavor.RE else _string_as_exp2
+    return builder(text)
+
+
+def _strings_as_exp(texts: Iterable[str]) -> str:
+    # Order the alternatives with iter_sort_by_len(reverse=True) before joining them with "|"
+    # NOTE(review): presumably this yields the longest strings first - confirm against iter_sort_by_len
+    ordered = iter_sort_by_len(texts, reverse=True)
+    return "|".join(_string_as_exp(text) for text in ordered)
+
+
+def _strings_as_exp2(texts: Iterable[str]) -> str:
+    # RE2 counterpart of _strings_as_exp: same ordering, alternatives escaped with the RE2 helper
+    # NOTE(review): presumably iter_sort_by_len(reverse=True) yields the longest strings first - confirm
+    ordered = iter_sort_by_len(texts, reverse=True)
+    return "|".join(_string_as_exp2(text) for text in ordered)
+
+
+def strings_as_exp(texts: Iterable[str], flavor: int | None = None) -> str:
"""Create a regex expression that exactly matches any one string.
Args:
texts (Iterable[str]): Strings to match.
- flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1.
+ flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None.
Returns:
str: Expression that exactly matches any one of the original strings.
@@ -102,21 +104,70 @@ def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str:
Raises:
ValueError: Invalid regex flavor.
"""
- try:
- flavor = RegexFlavor(flavor)
- except ValueError:
- raise ValueError(f"Invalid regex flavor: {flavor}")
-
- if flavor == RegexFlavor.RE:
+ if (flavor := resolve_flavor(flavor)) == RegexFlavor.RE:
return _strings_as_exp(texts)
- # elif flavor == RegexFlavor.RE2:
+ return _strings_as_exp2(texts)
+
+
+def _make_group_exp(group: list[int]) -> str:
+ if len(group) > 2:
+ # Represent as a character range
+ return _escape(chr(group[0])) + "-" + _escape(chr(group[-1]))
else:
- return _strings_as_exp2(texts)
+ # Represent as individual characters
+ return "".join((_escape(chr(char_ord)) for char_ord in group))
-def _strings_as_exp(texts: Iterable[str]) -> str:
- return r"|".join(map(_string_as_exp, iter_sort_by_len(texts, reverse=True)))
+def _make_group_exp2(group: list[int]) -> str:
+    """Render a run of consecutive ordinals using RE2 escaping.
+
+    Runs of one or two ordinals are written out character by character;
+    longer runs are collapsed to ``first-last``.
+    """
+    if len(group) <= 2:
+        # Too short to be worth a range: emit each character on its own
+        return "".join(map(_escape2, map(chr, group)))
+    first, last = group[0], group[-1]
+    return f"{_escape2(chr(first))}-{_escape2(chr(last))}"
-def _strings_as_exp2(texts: Iterable[str]) -> str:
- return r"|".join(map(_string_as_exp2, iter_sort_by_len(texts, reverse=True)))
+def make_exp(chars: Iterable[str], flavor: int | None = None) -> str:
+    """Create a regex expression that exactly matches a list of characters.
+
+    The characters are de-duplicated, sorted by code point, and grouped into ranges where possible.
+    The expression is not anchored, so it can be used as part of a larger expression.
+
+    Example:
+
+    ```python
+    exp = "[" + make_exp(["a", "b", "c", "z", "y", "x"]) + "]"
+    # Output: '[a-cx-z]'
+    ```
+
+    Args:
+        chars (Iterable[str]): Characters to match. Each item is passed to `ord`, so it must be a single character.
+        flavor (int | None, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to None.
+
+    Returns:
+        str: Expression that exactly matches the original characters. Empty if no characters are given.
+
+    Raises:
+        ValueError: Invalid regex flavor.
+    """
+    # Pick the group renderer for the resolved flavor. The RE2 renderer must only
+    # be selected in the else branch, otherwise it overrides the RE choice.
+    if resolve_flavor(flavor) == RegexFlavor.RE:
+        func = _make_group_exp
+    else:
+        func = _make_group_exp2
+
+    parts = []
+    group = []
+    for char_ord in sorted(set(map(ord, chars))):
+        if group and char_ord != group[-1] + 1:
+            # Gap in the code points: render the finished run and start a new one
+            parts.append(func(group))
+            group = []
+        group.append(char_ord)
+    if group:
+        # Render the final run
+        parts.append(func(group))
+    return "".join(parts)
diff --git a/src/regex_toolkit/constants.py b/src/regex_toolkit/constants.py
index 0d7cc43..542a922 100644
--- a/src/regex_toolkit/constants.py
+++ b/src/regex_toolkit/constants.py
@@ -2,16 +2,19 @@
This module contains constant values used throughout the project.
"""
+from __future__ import annotations
-from typing import Final
+from typing import TYPE_CHECKING
-from regex_toolkit.enums import RegexFlavor
+if TYPE_CHECKING:
+ from typing import Final
__all__ = [
"ALWAYS_ESCAPE",
"ALWAYS_SAFE",
"ASCIILETTERS",
"DIGITS",
+ "RESERVED_EXPRESSIONS",
]
DIGITS: Final[frozenset[str]] = frozenset(map(chr, b"0123456789"))
@@ -22,7 +25,6 @@
ALWAYS_ESCAPE: Final[frozenset[str]] = frozenset(
map(chr, b"()[]{}?*+-|^$\\.&~# \t\n\r\v\f")
)
-
-REGEX_FLAVORS: Final[frozenset[RegexFlavor]] = frozenset(
- {RegexFlavor.RE, RegexFlavor.RE2}
+RESERVED_EXPRESSIONS: Final[frozenset[str]] = frozenset(
+ {"\\A", "\\b", "\\B", "\\d", "\\D", "\\s", "\\S", "\\w", "\\W", "\\Z", "\\1"}
)
diff --git a/src/regex_toolkit/enums.py b/src/regex_toolkit/enums.py
index 55a9b9d..4f7c8cc 100644
--- a/src/regex_toolkit/enums.py
+++ b/src/regex_toolkit/enums.py
@@ -2,6 +2,11 @@
from enum import Enum
+__all__ = [
+ "ALL_REGEX_FLAVORS",
+ "RegexFlavor",
+]
+
class RegexFlavor(int, Enum):
"""Regex flavors.
@@ -13,3 +18,6 @@ class RegexFlavor(int, Enum):
RE = 1
RE2 = 2
+
+
+ALL_REGEX_FLAVORS: list[RegexFlavor] = [RegexFlavor.RE, RegexFlavor.RE2]
diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py
index 41c9df3..81a35d5 100644
--- a/src/regex_toolkit/utils.py
+++ b/src/regex_toolkit/utils.py
@@ -1,21 +1,60 @@
import unicodedata
from collections.abc import Generator, Iterable
+import regex_toolkit.base
+from regex_toolkit.enums import RegexFlavor
+
__all__ = [
- "iter_sort_by_len",
- "sort_by_len",
- "ord_to_cpoint",
- "cpoint_to_ord",
+ "char_range",
"char_to_cpoint",
- "to_utf8",
- "to_nfc",
+ "cpoint_to_ord",
"iter_char_range",
- "char_range",
+ "iter_sort_by_len",
"mask_span",
"mask_spans",
+ "ord_to_cpoint",
+ "resolve_flavor",
+ "sort_by_len",
+ "to_nfc",
+ "to_utf8",
]
+def resolve_flavor(potential_flavor: int | RegexFlavor | None) -> RegexFlavor:
+    """Resolve a regex flavor.
+
+    If the flavor is an integer, it is validated and returned.
+    If the flavor is a RegexFlavor, it is returned.
+    If the flavor is None, the default flavor is returned. To change the default flavor, set `regex_toolkit.base.default_flavor`.
+
+    ```python
+    import regex_toolkit as rtk
+
+    rtk.base.default_flavor = 2
+    assert rtk.utils.resolve_flavor(None) == rtk.enums.RegexFlavor.RE2
+    ```
+
+    Args:
+        potential_flavor (int | RegexFlavor | None): Potential regex flavor.
+
+    Returns:
+        RegexFlavor: Resolved regex flavor.
+
+    Raises:
+        ValueError: Invalid regex flavor (including an invalid or unset default when None is passed).
+    """
+    flavor = potential_flavor
+    if flavor is None:
+        # Only an omitted flavor falls back to the default. The default is read at
+        # call time so that reassigning it takes effect immediately.
+        flavor = regex_toolkit.base.default_flavor
+    try:
+        return RegexFlavor(flavor)
+    except ValueError as err:
+        # An explicitly passed invalid flavor must fail rather than silently become the default
+        raise ValueError(f"Invalid regex flavor: {flavor}") from err
+
+
def iter_sort_by_len(
texts: Iterable[str],
*,
@@ -59,8 +98,8 @@ def ord_to_cpoint(ordinal: int) -> str:
Example:
```python
- # Output: '00000061'
ord_to_cpoint(97)
+ # Output: '00000061'
```
Args:
@@ -90,8 +129,8 @@ def char_to_cpoint(char: str) -> str:
Example:
```python
- # Output: '00000061'
char_to_cpoint("a")
+ # Output: '00000061'
```
Args:
@@ -112,6 +151,13 @@ def to_nfc(text: str) -> str:
Form C favors the use of a fully combined character.
+ Example:
+
+ ```python
+ to_nfc("e\\u0301") == "é"
+ # Output: True
+ ```
+
Args:
text (str): String to normalize.
@@ -121,31 +167,57 @@ def to_nfc(text: str) -> str:
return unicodedata.normalize("NFC", text)
-def iter_char_range(first_cpoint: int, last_cpoint: int) -> Generator[str, None, None]:
- """Iterate all characters within a range of codepoints (inclusive).
+def iter_char_range(first_char: str, last_char: str) -> Generator[str, None, None]:
+ """Iterate all characters within a range of characters (inclusive).
+
+ Example:
+
+ ```python
+ tuple(iter_char_range("a", "c"))
+ # Output: ('a', 'b', 'c')
+
+ tuple(iter_char_range("c", "a"))
+ # Output: ('c', 'b', 'a')
+ ```
Args:
- first_cpoint (int): Starting (first) codepoint.
- last_cpoint (int): Ending (last) codepoint.
+ first_char (str): Starting (first) character.
+ last_char (str): Ending (last) character.
Yields:
- str: Characters within a range of codepoints.
+ str: Characters within a range of characters.
"""
- for i in range(ord(first_cpoint), ord(last_cpoint) + 1):
- yield chr(i)
+ first_ord = ord(first_char)
+ last_ord = ord(last_char)
+ if first_ord > last_ord:
+ ord_range = range(first_ord, last_ord - 1, -1)
+ else:
+ ord_range = range(first_ord, last_ord + 1)
+ for ordinal in ord_range:
+ yield chr(ordinal)
-def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...]:
- """Tuple of all characters within a range of codepoints (inclusive).
+def char_range(first_char: str, last_char: str) -> tuple[str, ...]:
+ """Tuple of all characters within a range of characters (inclusive).
+
+ Example:
+
+ ```python
+ char_range("a", "d")
+ # Output: ('a', 'b', 'c', 'd')
+
+ char_range("d", "a")
+ # Output: ('d', 'c', 'b', 'a')
+ ```
Args:
- first_cpoint (int): Starting (first) codepoint.
- last_cpoint (int): Ending (last) codepoint.
+ first_char (str): Starting (first) character.
+ last_char (str): Ending (last) character.
Returns:
- tuple[str, ...]: Characters within a range of codepoints.
+ tuple[str, ...]: Characters within a range of characters.
"""
- return tuple(iter_char_range(first_cpoint, last_cpoint))
+ return tuple(iter_char_range(first_char, last_char))
def mask_span(
diff --git a/tests/test_base.py b/tests/test_base.py
index afc1259..4020eae 100644
--- a/tests/test_base.py
+++ b/tests/test_base.py
@@ -1,555 +1,318 @@
+# import random
import re
-import unittest
+from collections.abc import Iterable
from itertools import product
+import pytest
import re2
import regex_toolkit
-from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE
-from regex_toolkit.enums import RegexFlavor
-
-
-class TestEscapeRE(unittest.TestCase):
- def setUp(self):
- self._flavor = RegexFlavor.RE
- self._re_compile = re.compile
-
- def test_safe(self):
- for char in ALWAYS_SAFE:
- with self.subTest(char=char):
- expected_exp = char
- actual_exp = regex_toolkit.escape(char, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the character.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(char))
-
- def test_escapable(self):
- for char in ALWAYS_ESCAPE:
- with self.subTest(char=char):
- expected_exp = f"\\{char}"
- actual_exp = regex_toolkit.escape(char, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the character.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(char))
-
- def test_unknown(self):
- # TODO: Include additional characters to test.
- for char in "π
°π
±π
Ύπ
Ώππππππππππππ¦π§π¨ππππ―π²π³π΄π΅πΆπ·πΈπΉπΊππππππππ
":
- with self.subTest(char=char):
- expected_exp = f"\\{char}"
- actual_exp = regex_toolkit.escape(char, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the character.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(char))
-
-
-class TestEscapeRE2(unittest.TestCase):
- def setUp(self):
- self._flavor = RegexFlavor.RE2
- self._re_compile = re2.compile
-
- def test_safe(self):
- for char in ALWAYS_SAFE:
- with self.subTest(char=char):
- expected_exp = char
- actual_exp = regex_toolkit.escape(char, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the character.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(char))
-
- def test_escapable(self):
- for char in ALWAYS_ESCAPE:
- with self.subTest(char=char):
- expected_exp = f"\\{char}"
- actual_exp = regex_toolkit.escape(char, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the character.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(char))
-
- def test_unknown(self):
- # TODO: Include additional characters to test.
- # NOTE: Same as running: "\\x{" + format(ord("π"), "x").zfill(8) + "}"
- for char, expected_exp in (
- # Length 1
- ("π
°", r"\x{0001f170}"),
- ("π
±", r"\x{0001f171}"),
- ("π
Ύ", r"\x{0001f17e}"),
- ("π
Ώ", r"\x{0001f17f}"),
- ("π", r"\x{0001f18e}"),
- ("π", r"\x{0001f191}"),
- ("π", r"\x{0001f192}"),
- ("π", r"\x{0001f193}"),
- ("π", r"\x{0001f194}"),
- ("π", r"\x{0001f195}"),
- ("π", r"\x{0001f196}"),
- ("π", r"\x{0001f197}"),
- ("π", r"\x{0001f198}"),
- ("π", r"\x{0001f199}"),
- ("π", r"\x{0001f19a}"),
- ("π¦", r"\x{0001f1e6}"),
- ("π§", r"\x{0001f1e7}"),
- ("π¨", r"\x{0001f1e8}"),
- ("π", r"\x{0001f201}"),
- ("π", r"\x{0001f202}"),
- ("π", r"\x{0001f21a}"),
- ("π―", r"\x{0001f22f}"),
- ("π²", r"\x{0001f232}"),
- ("π³", r"\x{0001f233}"),
- ("π΄", r"\x{0001f234}"),
- ("π΅", r"\x{0001f235}"),
- ("πΆ", r"\x{0001f236}"),
- ("π·", r"\x{0001f237}"),
- ("πΈ", r"\x{0001f238}"),
- ("πΉ", r"\x{0001f239}"),
- ("πΊ", r"\x{0001f23a}"),
- ("π", r"\x{0001f250}"),
- ("π", r"\x{0001f251}"),
- ("π", r"\x{0001f300}"),
- ("π", r"\x{0001f301}"),
- ("π", r"\x{0001f302}"),
- ("π", r"\x{0001f303}"),
- ("π", r"\x{0001f304}"),
- # Length 2
- ("π
", r"\x{0001f305}"),
- ):
- with self.subTest(char=char):
- actual_exp = regex_toolkit.escape(char, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the character.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(char))
-
-
-class TestStringAsExpressionRE(unittest.TestCase):
- def setUp(self):
- self._flavor = RegexFlavor.RE
- self._re_compile = re.compile
-
- def test_safe_individual_char(self):
- # Single character.
- for char in ALWAYS_SAFE:
- with self.subTest(char=char):
- text = char
- expected_exp = char
- actual_exp = regex_toolkit.string_as_exp(text, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the string.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(text))
-
- # TODO: Add tests for mix of characters.
- def test_safe_joined_as_one(self):
- # All characters.
- text = "".join(ALWAYS_SAFE)
- expected_exp = text
- actual_exp = regex_toolkit.string_as_exp(text, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the string.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(text))
-
- def test_escapable_individual_char(self):
- # Single character.
- for char in ALWAYS_ESCAPE:
- with self.subTest(char=char):
- text = char
- expected_exp = f"\\{char}"
- actual_exp = regex_toolkit.string_as_exp(text, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the string.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(text))
-
- def test_escapable_joined_as_one(self):
- # All characters.
- text = "".join(ALWAYS_ESCAPE)
- expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPE)
- actual_exp = regex_toolkit.string_as_exp(text, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the string.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(text))
-
- def test_unsafe_joined_as_one(self):
- # All characters.
- text = "π
°π
±π
Ύπ
Ώππππππππππππ¦π§π¨ππππ―π²π³π΄π΅πΆπ·πΈπΉπΊππππππππ
"
- expected_exp = "".join(f"\\{char}" for char in text)
- actual_exp = regex_toolkit.string_as_exp(text, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the string.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(text))
-
-
-class TestStringAsExpressionRE2(unittest.TestCase):
- def setUp(self):
- self._flavor = RegexFlavor.RE2
- self._re_compile = re2.compile
-
- # TODO: Add tests for mix of characters.
- def test_safe_individual_char(self):
- # Single character.
- for char in ALWAYS_SAFE:
- with self.subTest(char=char):
- text = char
- expected_exp = char
- actual_exp = regex_toolkit.string_as_exp(text, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the string.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(text))
-
- def test_safe_joined_as_one(self):
- # All characters.
- text = "".join(ALWAYS_SAFE)
- expected_exp = "".join(ALWAYS_SAFE)
- actual_exp = regex_toolkit.string_as_exp(text, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the string.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(text))
-
- def test_escapable_individual_char(self):
- # Single character.
- for char in ALWAYS_ESCAPE:
- with self.subTest(char=char):
- text = char
- expected_exp = f"\\{char}"
- actual_exp = regex_toolkit.string_as_exp(text, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the string.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(text))
-
- def test_escapable_joined_as_one(self):
- # All characters.
- text = "".join(ALWAYS_ESCAPE)
- expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPE)
- actual_exp = regex_toolkit.string_as_exp(text, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the string.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(text))
-
- def test_unknown_joined_as_one(self):
- text = "π
°π
±π
Ύπ
Ώππππππππππππ¦π§π¨ππππ―π²π³π΄π΅πΆπ·πΈπΉπΊππππππππ
"
- expected_exp = r"".join(
- (
- r"\x{0001f170}",
- r"\x{0001f171}",
- r"\x{0001f17e}",
- r"\x{0001f17f}",
- r"\x{0001f18e}",
- r"\x{0001f191}",
- r"\x{0001f192}",
- r"\x{0001f193}",
- r"\x{0001f194}",
- r"\x{0001f195}",
- r"\x{0001f196}",
- r"\x{0001f197}",
- r"\x{0001f198}",
- r"\x{0001f199}",
- r"\x{0001f19a}",
- r"\x{0001f1e6}",
- r"\x{0001f1e7}",
- r"\x{0001f1e8}",
- r"\x{0001f201}",
- r"\x{0001f202}",
- r"\x{0001f21a}",
- r"\x{0001f22f}",
- r"\x{0001f232}",
- r"\x{0001f233}",
- r"\x{0001f234}",
- r"\x{0001f235}",
- r"\x{0001f236}",
- r"\x{0001f237}",
- r"\x{0001f238}",
- r"\x{0001f239}",
- r"\x{0001f23a}",
- r"\x{0001f250}",
- r"\x{0001f251}",
- r"\x{0001f300}",
- r"\x{0001f301}",
- r"\x{0001f302}",
- r"\x{0001f303}",
- r"\x{0001f304}",
- # Length 2
- r"\x{0001f305}",
- )
+from regex_toolkit.constants import (
+ ALWAYS_ESCAPE,
+ ALWAYS_SAFE,
+ RESERVED_EXPRESSIONS,
+)
+from regex_toolkit.enums import ALL_REGEX_FLAVORS, RegexFlavor
+
+NON_ASCII_CHARS = "🅰🅱🅾🅿🆎🆑🆒🆓🆔🆕🆖🆗🆘🆙🆚🇦🇧🇨🈁🈂🈚🈯🈲🈳🈴🈵🈶🈷🈸🈹🈺🉐🉑🌀🌁🌂🌃🌄🌅"
+
+
+def _exp_will_match(exp: str, text: str, flavor: int) -> bool:  # True when `exp` full-matches `text`
+    if flavor == RegexFlavor.RE:  # named member instead of the magic number 1
+        return bool(re.fullmatch(exp, text))
+    if flavor == RegexFlavor.RE2:  # named member instead of the magic number 2
+        return bool(re2.fullmatch(exp, text))
+    raise ValueError(f"Invalid regex flavor: {flavor!r}")  # neither supported engine
+
+
+def assert_exp_will_match(exp: str, text: str, flavor: int) -> None:  # asserts only; returns nothing
+    assert _exp_will_match(
+        exp, text, flavor
+    ), f"RE{flavor} Pattern: {exp!r} does not match {text!r}"
+
+
+def assert_exp_will_match_all(exp: str, texts: Iterable[str], flavor: int) -> None:
+    for text in texts:  # the single expression must match every text on its own
+        assert_exp_will_match(exp, text, flavor)
+
+
+# RE and RE2 - Escape
+
+
+@pytest.mark.parametrize("char, expected", [(char, char) for char in ALWAYS_SAFE])
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS)
+def test_escape_and_escape2_safe(char, expected, flavor):
+ actual = regex_toolkit.escape(char, flavor)
+ assert actual == expected  # every ALWAYS_SAFE char is expected back unchanged
+ assert_exp_will_match(actual, char, flavor)  # and the expression must full-match the char
+
+
+@pytest.mark.parametrize(
+ "char, expected_exp", [(char, f"\\{char}") for char in ALWAYS_ESCAPE]
+)
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS)
+def test_escape_and_escape2_escapable(char, expected_exp, flavor):
+ actual = regex_toolkit.escape(char, flavor)
+ assert actual == expected_exp  # a single backslash in front of the char, for both flavors
+ assert_exp_will_match(actual, char, flavor)
+
+
+# RE - Escape
+
+
+@pytest.mark.parametrize(
+ "char, expected_exp",
+ [(char, f"\\{char}") for char in NON_ASCII_CHARS],
+)
+def test_escape_unknown(char, expected_exp):
+ actual = regex_toolkit.escape(char, RegexFlavor.RE)
+ assert actual == expected_exp  # RE flavor: backslash followed by the literal char
+ assert_exp_will_match(actual, char, RegexFlavor.RE)
+
+
+# RE2 - Escape
+
+
+@pytest.mark.parametrize(
+ "char, expected",
+ [
+ (char, "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}")  # 8 hex digits, cut to 4 when the top four are zero
+ for char in NON_ASCII_CHARS
+ ],
+)
+def test_escape2_unknown(char, expected):
+ actual = regex_toolkit.escape(char, RegexFlavor.RE2)
+ assert actual == expected  # RE2 flavor: \x{...} hex escape instead of a backslashed literal
+ assert_exp_will_match(actual, char, RegexFlavor.RE2)
+
+
+def test_escape2_trimmed():
+    text = "°"  # U+00B0: fits in four hex digits
+    expected = "\\x{00b0}"
+    actual = regex_toolkit.escape(text, RegexFlavor.RE2)
+    assert actual == expected
+    assert_exp_will_match(actual, text, RegexFlavor.RE2)
+
+
+def test_escape2_untrimmed():
+    text = "🅰"  # U+1F170: needs more than four hex digits
+    expected = "\\x{0001f170}"
+    actual = regex_toolkit.escape(text, RegexFlavor.RE2)
+    assert actual == expected
+    assert_exp_will_match(actual, text, RegexFlavor.RE2)
+
+
+# RE and RE2 - String as expression
+
+
+@pytest.mark.parametrize("text, expected", [(text, text) for text in ALWAYS_SAFE])
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS)
+def test_string_as_exp_and_exp2_safe_individual_char(text, expected, flavor):
+ actual = regex_toolkit.string_as_exp(text, flavor)
+ assert actual == expected  # one safe char in, the same char out
+ assert_exp_will_match(actual, text, flavor)
+
+
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS)
+def test_string_as_exp_and_exp2_safe_joined_as_one(flavor):
+ text = "".join(ALWAYS_SAFE)
+ expected = "".join(ALWAYS_SAFE)  # built the same way as `text`: nothing should be escaped
+ actual = regex_toolkit.string_as_exp(text, flavor)
+ assert actual == expected
+ assert_exp_will_match(actual, text, flavor)
+
+
+@pytest.mark.parametrize(
+ "text, expected", [(char, f"\\{char}") for char in ALWAYS_ESCAPE]
+)
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS)
+def test_string_as_exp_and_exp2_escapable_individual_char(text, expected, flavor):
+ actual = regex_toolkit.string_as_exp(text, flavor)
+ assert actual == expected  # a one-char string is expected to get a single leading backslash
+ assert_exp_will_match(actual, text, flavor)
+
+
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS)
+def test_string_as_exp_and_exp2_escapable_joined_as_one(flavor):
+ text = "".join(ALWAYS_ESCAPE)
+ expected = "".join(f"\\{char}" for char in ALWAYS_ESCAPE)  # each char escaped individually, same order as `text`
+ actual = regex_toolkit.string_as_exp(text, flavor)
+ assert actual == expected
+ assert_exp_will_match(actual, text, flavor)
+
+
+# RE - String as expression
+
+
+@pytest.mark.parametrize(
+ "text, expected",
+ [(text, f"\\{text}") for text in NON_ASCII_CHARS],
+)
+def test_string_as_exp_unsafe_individual_char(text, expected):
+ actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE)
+ assert actual == expected  # RE flavor: backslash followed by the literal char
+ assert_exp_will_match(actual, text, RegexFlavor.RE)
+
+
+def test_string_as_exp_unsafe_joined_as_one():
+ text = "".join(NON_ASCII_CHARS)
+ expected = "".join(f"\\{char}" for char in text)  # every char of the joined string gets its own backslash
+ actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE)
+ assert actual == expected
+ assert_exp_will_match(actual, text, RegexFlavor.RE)
+
+
+# RE2 - String as expression
+
+
+@pytest.mark.parametrize(
+ "text, expected",
+ [
+ (char, "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}")  # 8 hex digits, cut to 4 when the top four are zero
+ for char in NON_ASCII_CHARS
+ ],
+)
+def test_string_as_exp2_unknown_individual_char(text, expected):
+ actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE2)
+ assert actual == expected  # RE2 flavor: \x{...} hex escape
+ assert_exp_will_match(actual, text, RegexFlavor.RE2)
+
+
+def test_string_as_exp2_unknown_joined_as_one():
+ text = "".join(NON_ASCII_CHARS)
+ expected = "".join(
+ "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}"  # one hex escape per char
+ for char in text
+ )
+ actual = regex_toolkit.string_as_exp(text, RegexFlavor.RE2)
+ assert actual == expected
+ assert_exp_will_match(actual, text, RegexFlavor.RE2)
+
+
+# RE and RE2 - Strings as expression
+
+
+@pytest.mark.parametrize(
+ "texts, expected",
+ [(texts, r"|".join(texts)) for texts in product(ALWAYS_SAFE, repeat=2)],  # every ordered pair of safe chars
+)
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS)
+def test_strings_as_exp_and_exp2_safe_of_various_lengths(texts, expected, flavor):
+ actual = regex_toolkit.strings_as_exp(texts, flavor)
+ assert actual == expected  # alternatives joined with "|", nothing escaped
+ assert_exp_will_match_all(actual, texts, flavor)
+
+
+@pytest.mark.parametrize(
+ "texts, expected",
+ [
+ (texts, r"|".join(f"\\{text}" for text in texts))  # each alternative backslash-escaped
+ for texts in product(ALWAYS_ESCAPE, repeat=2)  # every ordered pair of escapable chars
+ ],
+)
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS)
+def test_strings_as_exp_and_exp2_escapable_of_various_lengths(texts, expected, flavor):
+ actual = regex_toolkit.strings_as_exp(texts, flavor)
+ assert actual == expected
+ assert_exp_will_match_all(actual, texts, flavor)
+
+
+@pytest.mark.parametrize(
+ "texts, expected",
+ [
+ (texts, r"|".join(f"\\{text}" for text in texts))  # one extra backslash in front of each reserved expression
+ for texts in product(RESERVED_EXPRESSIONS, repeat=2)
+ ],
+)
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS)
+def test_strings_as_exp_and_exp2_reserved_of_various_lengths(texts, expected, flavor):
+ actual = regex_toolkit.strings_as_exp(texts, flavor)
+ assert actual == expected
+ assert_exp_will_match_all(actual, texts, flavor)  # must full-match the reserved text itself
+
+
+@pytest.mark.parametrize(
+ "texts, expected",
+ [
+ (
+ texts,
+ r"|".join(text if text in ALWAYS_SAFE else f"\\{text}" for text in texts),  # escape only the non-safe chars
)
- actual_exp = regex_toolkit.string_as_exp(text, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches the string.
- pattern = self._re_compile(actual_exp)
- self.assertTrue(pattern.match(text))
+ for texts in product(ALWAYS_SAFE | ALWAYS_ESCAPE, repeat=2)  # ordered pairs drawn from both sets
+ ],
+)
+@pytest.mark.parametrize("flavor", ALL_REGEX_FLAVORS)
+def test_strings_as_exp_and_exp2_safe_and_escapable_of_various_lengths(
+ texts, expected, flavor
+):
+ actual = regex_toolkit.strings_as_exp(texts, flavor)
+ assert actual == expected
+ assert_exp_will_match_all(actual, texts, flavor)
+
+# RE - Strings as expression
-RESERVED_EXPRESSIONS = frozenset(
- {r"\A", r"\b", r"\B", r"\d", r"\D", r"\s", r"\S", r"\w", r"\W", r"\Z", r"\1"}
+
+@pytest.mark.parametrize(
+ "texts, expected",
+ [
+ (texts, r"|".join(f"\\{text}" for text in texts))  # RE flavor: backslash + literal char per alternative
+ for texts in product(NON_ASCII_CHARS, repeat=2)  # every ordered pair of non-ASCII chars
+ ],
)
+def test_strings_as_exp_unsafe_of_various_lengths(texts, expected):
+ actual = regex_toolkit.strings_as_exp(texts, RegexFlavor.RE)
+ assert actual == expected
+ assert_exp_will_match_all(actual, texts, RegexFlavor.RE)
+
+# RE2 - Strings as expression
-class StringsAsExpressionRE(unittest.TestCase):
- def setUp(self):
- self._flavor = RegexFlavor.RE
- self._re_compile = re.compile
- self._max_combo_length = 2
-
- def test_safe_of_various_lengths(self):
- # Unique combinations of `ALWAYS_SAFE` using various lengths.
- elements = ALWAYS_SAFE
- for texts in product(elements, repeat=self._max_combo_length):
- with self.subTest(texts=texts):
- expected_exp = r"|".join(texts)
- actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches each of the strings.
- pattern = self._re_compile(actual_exp)
- for text in texts:
- with self.subTest("match pattern", text=text):
- self.assertTrue(pattern.match(text))
-
- def test_escapable_of_various_lengths(self):
- # Unique combinations of `ALWAYS_ESCAPE` using various lengths.
- elements = ALWAYS_ESCAPE
- for texts in product(elements, repeat=self._max_combo_length):
- with self.subTest(texts=texts):
- expected_exp = r"|".join(f"\\{text}" for text in texts)
- actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches each of the strings.
- pattern = self._re_compile(actual_exp)
- for text in texts:
- with self.subTest("match pattern", text=text):
- self.assertTrue(pattern.match(text))
-
- def test_reserved_of_various_lengths(self):
- # Unique combinations of reserved expressions using various lengths.
- # Exact matches that equate to reserved spaces
- # E.g. Should match '\\' + 'n', not r'\n'
- elements = RESERVED_EXPRESSIONS
- for texts in product(elements, repeat=self._max_combo_length):
- with self.subTest(texts=texts):
- expected_exp = r"|".join(f"\\{text}" for text in texts)
- actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches each of the strings.
- pattern = self._re_compile(actual_exp)
- for text in texts:
- with self.subTest("match pattern", text=text):
- self.assertTrue(pattern.match(text))
-
- def test_unsafe_of_various_lengths(self):
- # TODO: Include text/chars such as punctuation, etc.
- # Unique combinations of `ALWAYS_SAFE` using various lengths.
- elements = "π
°π
±π
Ύπ
Ώππππππππππππ¦π§π¨ππππ―π²π³π΄π΅πΆπ·πΈπΉπΊππππππππ
"
- for texts in product(elements, repeat=self._max_combo_length):
- with self.subTest(texts=texts):
- expected_exp = r"|".join(f"\\{text}" for text in texts)
- actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches each of the strings.
- pattern = self._re_compile(actual_exp)
- for text in texts:
- with self.subTest("match pattern", text=text):
- self.assertTrue(pattern.match(text))
-
- def test_safe_and_escapable_of_various_lengths(self):
- # Unique combinations of `ALWAYS_SAFE` and `ALWAYS_ESCAPE` using various lengths.
- elements = ALWAYS_SAFE | ALWAYS_ESCAPE
- for texts in product(elements, repeat=self._max_combo_length):
- with self.subTest(texts=texts):
- expected_exp = r"|".join(
- text if text in ALWAYS_SAFE else f"\\{text}" for text in texts
- )
- actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches each of the strings.
- pattern = self._re_compile(actual_exp)
- for text in texts:
- with self.subTest("match pattern", text=text):
- self.assertTrue(pattern.match(text))
-
- # def test_actual_examples(self):
- #
-
-
-######################
-###################### # Multiple unsafe char
-###################### self.assertEqual(
-###################### regex_toolkit.strings_as_exp([".", "!", "?"], self._flavor),
-###################### "\\.|\\!|\\?",
-###################### )
-######################
-###################### for texts, expected_exp in [
-###################### (["π
°"], "\\π
°"),
-###################### (["π
°", "π
±"], "\\π
°|\\π
±"),
-###################### (["alpha", "beta"], "alpha|beta"),
-###################### (["π
°lpha", "π
±eta"], "\\π
°lpha|\\π
±eta"),
-###################### (["π
°lpha", "Beta"], "\\π
°lpha|Beta"),
-###################### ]:
-###################### self.assertEqual(
-###################### regex_toolkit.strings_as_exp(texts, self._flavor),
-###################### expected_exp,
-###################### )
-
-
-class StringsAsExpressionRE2(unittest.TestCase):
- def setUp(self):
- self._flavor = RegexFlavor.RE2
- self._re_compile = re2.compile
- self._max_combo_length = 2
-
- def test_safe_of_variable_lengths(self):
- # Unique combinations of ALWAYS_SAFE using various lengths.
- elements = set(ALWAYS_SAFE)
- for texts in product(elements, repeat=self._max_combo_length):
- with self.subTest(texts=texts):
- expected_exp = r"|".join(texts)
- actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches each of the strings.
- pattern = self._re_compile(actual_exp)
- for text in texts:
- with self.subTest("match pattern", text=text):
- self.assertTrue(pattern.match(text))
-
- def test_escapable_of_variable_lengths(self):
- # Unique combinations of ALWAYS_ESCAPE using various lengths.
- elements = ALWAYS_ESCAPE
- for texts in product(elements, repeat=self._max_combo_length):
- with self.subTest(texts=texts):
- expected_exp = r"|".join(f"\\{text}" for text in texts)
- actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches each of the strings.
- pattern = self._re_compile(actual_exp)
- for text in texts:
- with self.subTest("match pattern", text=text):
- self.assertTrue(pattern.match(text))
-
- def test_reserved_of_variable_lengths(self):
- # Unique combinations of reserved expressions using various lengths.
- # Exact matches that equate to reserved spaces
- # E.g. Should match '\\' + 'n', not r'\n'
- elements = RESERVED_EXPRESSIONS
- for texts in product(elements, repeat=self._max_combo_length):
- with self.subTest(texts=texts):
- expected_exp = r"|".join(f"\\{text}" for text in texts)
- actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches each of the strings.
- pattern = self._re_compile(actual_exp)
- for text in texts:
- with self.subTest("match pattern", text=text):
- self.assertTrue(pattern.match(text))
-
- def test_unsafe_of_variable_lengths(self):
- # TODO: Include text/chars such as punctuation, etc.
- # Unique combinations of ALWAYS_SAFE using various lengths.
- elements_map = {
- # Length 1
- "π
°": r"\x{0001f170}",
- "π
±": r"\x{0001f171}",
- "π
Ύ": r"\x{0001f17e}",
- "π
Ώ": r"\x{0001f17f}",
- "π": r"\x{0001f18e}",
- "π": r"\x{0001f191}",
- "π": r"\x{0001f192}",
- "π": r"\x{0001f193}",
- "π": r"\x{0001f194}",
- "π": r"\x{0001f195}",
- "π": r"\x{0001f196}",
- "π": r"\x{0001f197}",
- "π": r"\x{0001f198}",
- "π": r"\x{0001f199}",
- "π": r"\x{0001f19a}",
- "π¦": r"\x{0001f1e6}",
- "π§": r"\x{0001f1e7}",
- "π¨": r"\x{0001f1e8}",
- "π": r"\x{0001f201}",
- "π": r"\x{0001f202}",
- "π": r"\x{0001f21a}",
- "π―": r"\x{0001f22f}",
- "π²": r"\x{0001f232}",
- "π³": r"\x{0001f233}",
- "π΄": r"\x{0001f234}",
- "π΅": r"\x{0001f235}",
- "πΆ": r"\x{0001f236}",
- "π·": r"\x{0001f237}",
- "πΈ": r"\x{0001f238}",
- "πΉ": r"\x{0001f239}",
- "πΊ": r"\x{0001f23a}",
- "π": r"\x{0001f250}",
- "π": r"\x{0001f251}",
- "π": r"\x{0001f300}",
- "π": r"\x{0001f301}",
- "π": r"\x{0001f302}",
- "π": r"\x{0001f303}",
- "π": r"\x{0001f304}",
- # Length 2
- "π
": r"\x{0001f305}",
- }
- elements = tuple(elements_map)
- for texts in product(elements, repeat=self._max_combo_length):
- with self.subTest(texts=texts):
- expected_exp = r"|".join(elements_map[text] for text in texts)
- actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor)
- self.assertEqual(actual_exp, expected_exp)
- # Ensure the expression compiles and matches each of the strings.
- pattern = self._re_compile(actual_exp)
- for text in texts:
- with self.subTest("match pattern", text=text):
- self.assertTrue(pattern.match(text))
-
-
-##############################3 # Exact matches that equate to reserved spaces
-##############################3 # E.g. Should match '\\' + 'n', not r'\n'
-##############################3 for text in (r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"):
-##############################3 texts = [text]
-##############################3 with self.subTest(texts=texts):
-##############################3 self.assertEqual(
-##############################3 regex_toolkit.strings_as_exp(texts, self._flavor),
-##############################3 f"\\{text}",
-##############################3 )
-##############################3
-##############################3 # Single whitespace char
-##############################3 for texts in (["\n"], ["\v"], ["\t"], ["\r"], ["\f"], ["\v"]):
-##############################3 with self.subTest(texts=texts):
-##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, self._flavor), texts[0])
-##############################3
-##############################3 # Single unsafe char
-##############################3 for texts, expected_exp in [
-##############################3 (["."], "\\."),
-##############################3 (["!"], "\\!"),
-##############################3 (["?"], "\\?"),
-##############################3 ]:
-##############################3 with self.subTest(texts=texts, expected_exp=expected_exp):
-##############################3 self.assertEqual(
-##############################3 regex_toolkit.strings_as_exp(texts, self._flavor),
-##############################3 expected_exp,
-##############################3 )
-##############################3
-##############################3 # Multiple unsafe char
-##############################3 texts = [".", "!", "?"]
-##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, self._flavor), "\\.|\\!|\\?")
-##############################3
-##############################3 for texts, expected_exp in [
-##############################3 (["π
°"], "\\x{0001f170}"),
-##############################3 (["π
°", "π
±"], "\\x{0001f170}|\\x{0001f171}"),
-##############################3 (["alpha", "beta"], "alpha|beta"),
-##############################3 (["π
°lpha", "π
±eta"], "\\x{0001f170}lpha|\\x{0001f171}eta"),
-##############################3 (["π
°lpha", "Beta"], "\\x{0001f170}lpha|Beta"),
-##############################3 ]:
-##############################3 with self.subTest(texts=texts, expected_exp=expected_exp):
-##############################3 self.assertEqual(
-##############################3 regex_toolkit.strings_as_exp(texts, self._flavor),
-##############################3 expected_exp,
-##############################3 )
-
-# TODO: Add tests for actually compiling the e.
+
+@pytest.mark.parametrize(
+    "texts, expected",
+    [
+        (
+            texts,
+            r"|".join(
+                "\\x{" + format(ord(char), "x").zfill(8).removeprefix("0000") + "}"
+                for char in texts
+            ),
+        )
+        for texts in product(NON_ASCII_CHARS, repeat=2)  # no `*`: pairs of chars, not one giant tuple
+    ],
+)
+def test_strings_as_exp2_unsafe_of_various_lengths(texts, expected):
+    actual = regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2)
+    assert actual == expected
+    assert_exp_will_match_all(actual, texts, RegexFlavor.RE2)
+
+
+# Make expression
+
+
+@pytest.mark.parametrize(
+ "chars, expected",
+ (
+ # 1 char does not make a range
+ (["a"], "a"),
+ # 2 chars should not make a range
+ (["a", "b"], "ab"),
+ # 3+ sequential chars make a range
+ (["a", "b", "c"], "a-c"),
+ # 3+ non-sequential chars should not make a range
+ (["a", "c", "e"], "ace"),
+ # 3+ sequential chars with extra out of range char
+ (["a", "b", "c", "z"], "a-cz"),
+ # Two chars given in reverse are emitted in ordinal order
+ (["b", "a"], "ab"),
+ # Non-sequential chars given in reverse are emitted in ordinal order
+ (["e", "c", "a"], "ace"),
+ # Reversed input still collapses the sequential run into a range
+ (["z", "c", "b", "a"], "a-cz"),
+ # Duplicates should be removed
+ (["d", "a", "b", "c", "a"], "a-d"),
+ ),
+)
+def test_make_exp(chars, expected):
+ assert regex_toolkit.make_exp(chars, RegexFlavor.RE) == expected  # only the RE flavor is exercised here
diff --git a/tests/test_enums.py b/tests/test_enums.py
index 3af0dbd..d061d36 100644
--- a/tests/test_enums.py
+++ b/tests/test_enums.py
@@ -3,18 +3,23 @@
from regex_toolkit.enums import RegexFlavor
-def test_regex_flavor_enum_is_int():
- assert isinstance(RegexFlavor.RE, int)
- assert RegexFlavor.RE == 1
- assert RegexFlavor(1) == RegexFlavor.RE
- assert isinstance(RegexFlavor.RE2, int)
- assert RegexFlavor.RE2 == 2
- assert RegexFlavor(2) == RegexFlavor.RE2
+def test_regex_flavor_enum_has_expected_members():
+ assert len(RegexFlavor) == 2  # exactly the two flavors checked below
+ assert len(set(RegexFlavor)) == len(RegexFlavor)  # members are pairwise distinct
+ assert RegexFlavor.RE.name == "RE"
+ assert RegexFlavor.RE.value == RegexFlavor.RE == RegexFlavor(1) == 1  # member compares equal to the plain int
+ assert RegexFlavor(1) is RegexFlavor.RE  # lookup by value returns the member itself
-def test_invalid_regex_flavor_raises_value_error():
- with pytest.raises(ValueError):
- RegexFlavor(0)
+ assert RegexFlavor.RE2.name == "RE2"
+ assert RegexFlavor.RE2 == RegexFlavor.RE2.value == RegexFlavor(2) == 2  # member compares equal to the plain int
+ assert RegexFlavor(2) is RegexFlavor.RE2  # lookup by value returns the member itself
- with pytest.raises(ValueError):
- RegexFlavor(3)
+
+@pytest.mark.parametrize("invalid_flavor", (0, 3))  # the values just outside 1 and 2
+def test_invalid_regex_flavor_raises_value_error(invalid_flavor):
+ with pytest.raises(
+ ValueError,
+ match=f"^{invalid_flavor} is not a valid RegexFlavor$",  # anchored: the whole message must match
+ ):
+ RegexFlavor(invalid_flavor)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index c1fccdf..5c27b37 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,7 +1,51 @@
import unittest
from collections.abc import Generator, Iterable
+from unittest import mock
+
+import pytest
import regex_toolkit
+from regex_toolkit.enums import RegexFlavor
+
+
+@pytest.mark.parametrize(
+ "potential_flavor, expected",
+ [
+ (1, RegexFlavor.RE),  # plain ints
+ (2, RegexFlavor.RE2),
+ (RegexFlavor.RE, RegexFlavor.RE),  # enum members
+ (RegexFlavor.RE2, RegexFlavor.RE2),
+ (RegexFlavor(1), RegexFlavor.RE),  # members obtained by value lookup
+ (RegexFlavor(2), RegexFlavor.RE2),
+ ],
+)
+def test_resolve_flavor_with_valid(potential_flavor, expected):
+ assert regex_toolkit.base.resolve_flavor(potential_flavor) == expected
+
+
+@mock.patch("regex_toolkit.base.default_flavor", None)  # no module-level default for this test
+def test_resolve_flavor_with_invalid_and_with_no_default_raises_value_error():
+ with pytest.raises(ValueError, match=r"^Invalid regex flavor: None$"):  # anchored: exact message
+ regex_toolkit.base.resolve_flavor(None)
+
+
+@pytest.mark.parametrize("potential_flavor", [None, 0, 3, "1", "2"])
+@mock.patch("regex_toolkit.base.default_flavor", RegexFlavor.RE)  # default in force for this test
+def test_resolve_flavor_falls_back_to_default(potential_flavor):
+    assert regex_toolkit.base.resolve_flavor(potential_flavor) == RegexFlavor.RE
+
+
+@pytest.mark.parametrize("potential_flavor", [None, 0, 3, "1", "2"])  # None, out-of-range ints, and digit strings
+@mock.patch("regex_toolkit.base.default_flavor", None)  # no default to fall back to
+def test_resolve_flavor_invalid_int_without_default_raises(potential_flavor):
+ with pytest.raises(ValueError, match=r"^Invalid regex flavor: (None|'?\d'?)$"):  # optional quotes cover the repr of str inputs
+ regex_toolkit.base.resolve_flavor(potential_flavor)
+
+
+@mock.patch("regex_toolkit.base.default_flavor", None)  # patched so the assignment below is undone after the test
+def test_default_flavor_can_be_set():
+ regex_toolkit.base.default_flavor = 2  # a plain int, not an enum member
+ assert regex_toolkit.base.resolve_flavor(None) == RegexFlavor.RE2
def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool:
@@ -19,81 +63,118 @@ def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool:
return True
-class TestSortByLength(unittest.TestCase):
- def setUp(self) -> None:
- self.texts = {
- "apple",
- "orange",
- "banana",
- "grape",
- "apricot",
- "cherry",
- "plum",
- "blueberry",
- "strawberry",
- "blackberry",
- }
- self.texts_by_type = (
- (set, self.texts),
- (frozenset, frozenset(self.texts)),
- (tuple, tuple(self.texts)),
- (list, list(self.texts)),
- (dict, dict.fromkeys(self.texts, None)),
- )
-
- def test_iter_sort_by_len(self):
- for try_type, typed_texts in self.texts_by_type:
- for reverse in (False, True):
- with self.subTest(
- try_type=try_type,
- typed_texts=typed_texts,
- reverse=reverse,
- ):
- result = regex_toolkit.iter_sort_by_len(
- typed_texts,
- reverse=reverse,
- )
- self.assertIsInstance(result, Generator)
- result_tuple = tuple(result)
- self.assertTrue(is_sorted_by_len(result_tuple, reverse=reverse))
- self.assertEqual(
- result_tuple,
- tuple(sorted(typed_texts, key=len, reverse=reverse)),
- )
-
- def test_sort_by_len(self):
- for try_type, typed_texts in self.texts_by_type:
- for reverse in (False, True):
- with self.subTest(
- try_type=try_type,
- typed_texts=typed_texts,
- reverse=reverse,
- ):
- result = regex_toolkit.sort_by_len(typed_texts, reverse=reverse)
- self.assertIsInstance(result, tuple)
- self.assertTrue(is_sorted_by_len(result, reverse=reverse))
- self.assertEqual(
- result,
- tuple(sorted(typed_texts, key=len, reverse=reverse)),
- )
-
-
-class TestIterCharRange(unittest.TestCase):
- def test_iter_char_range(self):
- result = regex_toolkit.iter_char_range("a", "z")
- self.assertIsInstance(result, Generator)
- self.assertTupleEqual(
- tuple(result),
- tuple("abcdefghijklmnopqrstuvwxyz"),
- )
-
- def test_char_range(self):
- result = regex_toolkit.char_range("a", "z")
- self.assertIsInstance(result, tuple)
- self.assertTupleEqual(
- result,
- tuple("abcdefghijklmnopqrstuvwxyz"),
- )
+# Words used by the sort-by-length tests; several of them share a length.
+SORT_BY_LEN_TEXTS = [
+    "apple",
+    "orange",
+    "banana",
+    "grape",
+    "apricot",
+    "cherry",
+    "plum",
+    "blueberry",
+    "strawberry",
+    "blackberry",
+]
+SORT_BY_LEN_TEXTS_BY_TYPE = {
+    # The same words wrapped in each container type the tests iterate over.
+    **{kind: kind(SORT_BY_LEN_TEXTS) for kind in (set, frozenset, tuple, list)},
+    # For dict, the words are the keys and every value is None.
+    dict: dict.fromkeys(SORT_BY_LEN_TEXTS),
+}
+
+
+@pytest.mark.parametrize("try_type, typed_texts", SORT_BY_LEN_TEXTS_BY_TYPE.items())
+@pytest.mark.parametrize("reverse", (False, True))
+def test_iter_sort_by_len(try_type, typed_texts, reverse):
+    """`iter_sort_by_len` returns a generator matching `sorted(..., key=len)`."""
+    expected_tuple = tuple(sorted(typed_texts, key=len, reverse=reverse))
+    assert is_sorted_by_len(expected_tuple, reverse=reverse)  # sanity-check the oracle
+    produced = regex_toolkit.iter_sort_by_len(typed_texts, reverse=reverse)
+    actual_tuple = tuple(produced)
+    assert isinstance(produced, Generator) and (actual_tuple == expected_tuple), {
+        "try_type": try_type,
+        "typed_texts": typed_texts,
+        "reverse": reverse,
+        "actual_tuple": actual_tuple,
+        "expected_tuple": expected_tuple,
+    }
+
+
+@pytest.mark.parametrize("try_type, typed_texts", SORT_BY_LEN_TEXTS_BY_TYPE.items())
+@pytest.mark.parametrize("reverse", (False, True))
+def test_sort_by_len(try_type, typed_texts, reverse):
+    """`sort_by_len` returns a tuple matching `sorted(..., key=len)`."""
+    expected = tuple(sorted(typed_texts, key=len, reverse=reverse))
+    assert is_sorted_by_len(expected, reverse=reverse)  # sanity-check the oracle
+    result = regex_toolkit.sort_by_len(typed_texts, reverse=reverse)
+    assert isinstance(result, tuple) and (result == expected), {
+        "try_type": try_type,
+        "typed_texts": typed_texts,
+        "reverse": reverse,
+        "actual": result,
+        "expected": expected,
+    }
+
+
+ITER_CHAR_RANGE_CASES = [
+    # Single char
+    (("a", "a"), ("a",)),
+    # Basic range
+    (("a", "d"), ("a", "b", "c", "d")),
+    # Reverse range
+    (("d", "a"), ("d", "c", "b", "a")),
+    # Basic range (non-ASCII): U+1F436 through U+1F43A
+    (("🐶", "🐺"), ("🐶", "🐷", "🐸", "🐹", "🐺")),
+]
+
+
+@pytest.mark.parametrize("char_range, expected", ITER_CHAR_RANGE_CASES)
+def test_char_range(char_range, expected):
+    """`char_range` returns a tuple equal to the expected chars for each case."""
+    # Each case supplies the (first, last) pair that is spread into the call.
+    actual = regex_toolkit.char_range(*char_range)
+    assert isinstance(actual, tuple)
+    # Shown only when the comparison below fails.
+    details = {"char_range": char_range, "actual": actual, "expected": expected}
+    assert actual == expected, details
+
+
+@pytest.mark.parametrize("char_range, expected", ITER_CHAR_RANGE_CASES)
+def test_iter_char_range(char_range, expected):
+    """`iter_char_range` returns a generator yielding the expected chars."""
+    actual = regex_toolkit.iter_char_range(*char_range)
+    # Check the type before the generator is consumed.
+    assert isinstance(actual, Generator)
+    actual_tuple = tuple(actual)
+    # Shown only when the comparison below fails.
+    details = dict(char_range=char_range, actual_tuple=actual_tuple, expected=expected)
+    assert actual_tuple == expected, details
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    (
+        # Expected values use escapes so the composed code points are unambiguous.
+        # Empty string
+        ("", ""),
+        # Already NFC
+        ("a", "a"),
+        # Already NFC (non-ASCII)
+        ("🐶🐾", "🐶🐾"),
+        # Basic combining char (acute accent)
+        ("a\u0301", "\u00e1"),
+        # Multiple combining chars (diaeresis and acute accent)
+        # U+0308 composes into U+00F6; U+0301 remains a combining mark.
+        ("o\u0308\u0301", "\u00f6\u0301"),
+    ),
+)
+def test_to_nfc(text, expected):
+    """`to_nfc` returns a `str` equal to the expected NFC form of `text`."""
+    actual = regex_toolkit.to_nfc(text)
+    assert isinstance(actual, str)
+    # The failure message echoes the input next to both strings.
+    assert actual == expected, {"text": text, "actual": actual, "expected": expected}
class TestMasking(unittest.TestCase):