diff --git a/.github/.gitignore b/.github/.gitignore index 98b34b4..b9bdbd9 100644 --- a/.github/.gitignore +++ b/.github/.gitignore @@ -2,7 +2,11 @@ * # Except +!actions/ +!actions/* !ISSUE_TEMPLATE/ !ISSUE_TEMPLATE/* +!workflows/ +!workflows/* !.gitignore diff --git a/.github/ISSUE_TEMPLATE/.gitignore b/.github/ISSUE_TEMPLATE/.gitignore index eb5bc56..fc83fe4 100644 --- a/.github/ISSUE_TEMPLATE/.gitignore +++ b/.github/ISSUE_TEMPLATE/.gitignore @@ -2,8 +2,8 @@ * # Except -!bug_report.yaml -!feature_request.yaml +!bug_report.yml +!feature_request.yml !question.md !.gitignore diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yml similarity index 99% rename from .github/ISSUE_TEMPLATE/bug_report.yaml rename to .github/ISSUE_TEMPLATE/bug_report.yml index 638b689..c3decc6 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yaml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -1,6 +1,6 @@ name: Bug report description: Are you experiencing a problem? Create a report to help us improve! -labels: "bug" +labels: ["bug"] body: - type: markdown attributes: diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yml similarity index 92% rename from .github/ISSUE_TEMPLATE/feature_request.yaml rename to .github/ISSUE_TEMPLATE/feature_request.yml index 0fcf562..5c09dff 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yaml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -1,6 +1,6 @@ name: Feature Request description: Want a feature? Ask; we don't bite! 
-labels: 'enhancement' +labels: ["enhancement"] body: - type: markdown attributes: @@ -24,4 +24,4 @@ body: description: If so, specify - type: input attributes: - label: Additional context \ No newline at end of file + label: Additional context diff --git a/.github/actions/.gitignore b/.github/actions/.gitignore new file mode 100644 index 0000000..4769b9d --- /dev/null +++ b/.github/actions/.gitignore @@ -0,0 +1,12 @@ +# Ignore everything +* + +# Except +!build_regex_toolkit/ +!build_regex_toolkit/* +!run-tests/ +!run-tests/* +!setup-conda/ +!setup-conda/* + +!.gitignore diff --git a/.github/actions/build_regex_toolkit/.gitignore b/.github/actions/build_regex_toolkit/.gitignore new file mode 100644 index 0000000..34b5ce3 --- /dev/null +++ b/.github/actions/build_regex_toolkit/.gitignore @@ -0,0 +1,7 @@ +# Ignore everything +* + +# Except +!action.yml + +!.gitignore diff --git a/.github/actions/build_regex_toolkit/action.yml b/.github/actions/build_regex_toolkit/action.yml new file mode 100644 index 0000000..de017a6 --- /dev/null +++ b/.github/actions/build_regex_toolkit/action.yml @@ -0,0 +1,15 @@ +name: Build regex_toolkit +description: Build regex_toolkit +runs: + using: composite + steps: + - name: Environment Detail + run: | + micromamba info + micromamba list + shell: bash -el {0} + + - name: Build Regex-Toolkit + run: | + python -m pip install -e . 
--no-build-isolation --no-index + shell: bash -el {0} diff --git a/.github/actions/run-tests/.gitignore b/.github/actions/run-tests/.gitignore new file mode 100644 index 0000000..34b5ce3 --- /dev/null +++ b/.github/actions/run-tests/.gitignore @@ -0,0 +1,7 @@ +# Ignore everything +* + +# Except +!action.yml + +!.gitignore diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml new file mode 100644 index 0000000..8ea25ad --- /dev/null +++ b/.github/actions/run-tests/action.yml @@ -0,0 +1,33 @@ +name: Run tests and report results +description: Run tests and report results +# inputs: +# codecov-token: +# description: Codecov token for private repo. + +runs: + using: composite + steps: + - name: Test + run: ci/run_tests.sh + shell: bash -el {0} + + - name: Publish Test Results + uses: actions/upload-artifact@v2 + with: + name: Test results + path: test-data.xml + + - name: Report Coverage + run: coverage report -m + shell: bash -el {0} + + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v3 + with: + # token: ${{ inputs.codecov-token }} + files: ./coverage.xml + flags: unittests + name: regex_toolkit + fail_ci_if_error: false + # env: + # CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/actions/setup-conda/.gitignore b/.github/actions/setup-conda/.gitignore new file mode 100644 index 0000000..34b5ce3 --- /dev/null +++ b/.github/actions/setup-conda/.gitignore @@ -0,0 +1,7 @@ +# Ignore everything +* + +# Except +!action.yml + +!.gitignore diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml new file mode 100644 index 0000000..39cb954 --- /dev/null +++ b/.github/actions/setup-conda/action.yml @@ -0,0 +1,23 @@ +name: Set up Conda environment +description: Set up Conda environment +inputs: + environment-file: + description: Conda environment file to use. + default: environment.yml + environment-name: + description: Name to use for the Conda environment. 
+ default: test + +runs: + using: composite + steps: + - name: Install ${{ inputs.environment-file }} + uses: mamba-org/provision-with-micromamba@v12 + with: + environment-file: ${{ inputs.environment-file }} + environment-name: ${{ inputs.environment-name }} + channels: conda-forge + channel-priority: ${{ runner.os == 'macOS' && 'flexible' || 'strict' }} + condarc-file: ci/condarc.yml + cache-env: true + cache-downloads: true diff --git a/.github/workflows/.gitignore b/.github/workflows/.gitignore new file mode 100644 index 0000000..dcba760 --- /dev/null +++ b/.github/workflows/.gitignore @@ -0,0 +1,9 @@ +# Ignore everything +* + +# Except +!stale-pr.yml +!ubuntu.yml +windows-macos.yml + +!.gitignore diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml new file mode 100644 index 0000000..c5d0eea --- /dev/null +++ b/.github/workflows/stale-pr.yml @@ -0,0 +1,26 @@ +name: "Stale PRs" +on: + schedule: + # * is a special character in YAML so you have to quote this string + - cron: "0 0 * * *" + +permissions: + contents: read + +jobs: + stale: + permissions: + pull-requests: write + runs-on: ubuntu-22.04 + steps: + - uses: actions/stale@v4 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. Please update and respond to this comment if you're still interested in working on this." 
+ stale-pr-label: "Stale" + exempt-pr-labels: "Needs Review,Blocked,Needs Discussion" + days-before-issue-stale: -1 + days-before-pr-stale: 30 + days-before-close: -1 + remove-stale-when-updated: false + debug-only: false diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml new file mode 100644 index 0000000..dd61ac3 --- /dev/null +++ b/.github/workflows/ubuntu.yml @@ -0,0 +1,54 @@ +name: Ubuntu + +on: + push: + branches: ["main"] + pull_request: + branches: ["main"] + paths-ignore: ["docs/**"] + +env: + REGEX_TOOLKIT_CI: 1 + +permissions: + contents: read + +jobs: + pytest: + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -el {0} + timeout-minutes: 60 + strategy: + matrix: + env_file: [actions-310.yml, actions-311.yml] + fail-fast: false + # name: ${{ matrix.name || matrix.env_file }} + name: ${{ matrix.env_file }} + env: + ENV_FILE: ci/deps/${{ matrix.env_file }} + IS_PYPY: ${{ contains(matrix.env_file, 'pypy') }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }} + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ${{ env.ENV_FILE }} + + - name: Build Regex-Toolkit + uses: ./.github/actions/build_regex_toolkit + + - name: Test + uses: ./.github/actions/run-tests + # with: + # codecov-token: ${{ secrets.CODECOV_TOKEN }} diff --git a/.gitignore b/.gitignore index f50a89e..5013d26 100644 --- a/.gitignore +++ b/.gitignore @@ -5,12 +5,18 @@ !setup.py !pyproject.toml !Makefile +!environment.yml +!codecov.yml +!requirements-doc.txt !src/ !src/* - !tests/ !tests/* +!ci/ +!ci/* +!docs/ +!docs/* !LICENSE !README.md diff --git a/Makefile b/Makefile index 23d0b15..8392872 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,41 @@ PYTHON=python3 +APP_NAME=regex-toolkit 
install: ${PYTHON} -m pip install . +install-dev: + ${PYTHON} -m pip install -e . + test: - ${PYTHON} setup.py test + @echo 'Running tests' + ${PYTHON} -m pytest tests + @echo 'Done' + +lint: + @echo 'Linting code' + ${PYTHON} -m pylint src + @echo 'Done' + +format: + @echo 'Formatting code' + ${PYTHON} -m isort src tests docs/render_readme.py + ${PYTHON} -m black src tests docs/render_readme.py + @echo 'Done' build: - ${PYTHON} setup.py build + @echo 'Building package' + ${PYTHON} -m build + @echo 'Done' publish: - ${PYTHON} setup.py publish + @echo 'Building package' + ${PYTHON} -m build + @echo 'Uploading package' + ${PYTHON} -m twine upload dist/${APP_NAME}-*.tar.gz dist/${APP_NAME}-*.whl + @echo 'Done' + +readme: + @echo 'Generating README.md' + ${PYTHON} docs/render_readme.py + @echo 'Done' diff --git a/README.md b/README.md index e546943..1be1bae 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,12 @@ # Regex-Toolkit -[Regex-Toolkit](https://github.com/Phosmic/regex-toolkit): Effortlessly craft efficient [RE](https://docs.python.org/3/library/re.html) and [RE2](https://github.com/google/re2) expressions with user-friendly tools. +[Regex-Toolkit](https://github.com/Phosmic/regex-toolkit) provides tools for creating [RE](https://docs.python.org/3/library/re.html) and [RE2](https://github.com/google/re2) expressions. + +--- ## Requirements: -**Regex-Toolkit** requires Python 3.9 or higher, is platform independent, and has no outside dependencies. +**Regex-Toolkit** requires Python 3.10 or higher, is platform independent, and has no outside dependencies. 
## Issue reporting @@ -20,22 +22,39 @@ You should have received a copy of the GNU General Public License along with thi --- +[Requirements](#requirements) +[Installing](#installing) +[Usage](#usage) +[Library](#library) + ## Installing Most stable version from [**PyPi**](https://pypi.org/project/regex-toolkit/): +[![PyPI](https://img.shields.io/pypi/v/regex-toolkit?style=flat-square)](https://pypi.org/project/regex-toolkit/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/regex-toolkit?style=flat-square)](https://pypi.org/project/regex-toolkit/) +[![PyPI - License](https://img.shields.io/pypi/l/regex-toolkit?style=flat-square)](https://pypi.org/project/regex-toolkit/) + ```bash -python3 -m pip install regex-toolkit +$ python3 -m pip install regex-toolkit ``` Development version from [**GitHub**](https://github.com/Phosmic/regex-toolkit): + +![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/Phosmic/regex-toolkit/ubuntu.yml?style=flat-square) +![Codecov](https://img.shields.io/codecov/c/github/Phosmic/regex-toolkit/main?flag=unittests&style=flat-square&token=NOT_YET_CONFIGURED) +![GitHub](https://img.shields.io/github/license/Phosmic/regex-toolkit?style=flat-square) + + ```bash -git clone git+https://github.com/Phosmic/regex-toolkit.git -cd regex-toolkit -python3 -m pip install . +$ git clone git+https://github.com/Phosmic/regex-toolkit.git +$ cd regex-toolkit +$ python3 -m pip install -e . ``` +--- + ## Usage Import packages: @@ -47,455 +66,328 @@ import re2 ``` ```python -# Can import directly if desired -import regex_toolkit as rtk +import regex_toolkit ``` --- ## Library -### iter_sort_by_len - -Function to iterate strings sorted by length. 
+ -| Function Signature | -| :------------------------------------------------ | -| iter_sort_by_len(package_name, \*, reverse=False) | +# `regex_toolkit.utils` -| Parameters | | -| :------------------------- | :---------------------------------------------- | -| **texts**_(Iterable[str])_ | Strings to sort. | -| **reverse**_(int)_ | Sort in descending order (longest to shortest). | + -Example (ascending shortest to longest): +#### `iter_sort_by_len` ```python -words = ["longest", "short", "longer"] -for word in rtk.iter_sort_by_len(words): - print(word) +def iter_sort_by_len(texts: Iterable[str], + *, + reverse: bool = False) -> Generator[str, None, None] ``` -Output: +Iterate strings sorted by length. -```text -short -longer -longest -``` +**Arguments**: -Example reversed (descending longest to shortest): +- `texts` _Iterable[str]_ - Strings to sort. +- `reverse` _bool, optional_ - Sort in descending order (longest to shortest). Defaults to False. -```python -words = ["longest", "short", "longer"] -for word in rtk.iter_sort_by_len(words, reverse=True): - print(word) -``` +**Yields**: + +- _str_ - Strings sorted by length. + + -Output: +#### `sort_by_len` -```text -longest -longer -short +```python +def sort_by_len(texts: Iterable[str], + *, + reverse: bool = False) -> tuple[str, ...] ``` -### sort_by_len +Sort strings by length. -Function to get a tuple of strings sorted by length. +**Arguments**: -| Function Signature | -| :------------------------------------------- | -| sort_by_len(package_name, \*, reverse=False) | +- `texts` _Iterable[str]_ - Strings to sort. +- `reverse` _bool, optional_ - Sort in descending order (longest to shortest). Defaults to False. -| Parameters | | -| :------------------------- | :---------------------------------------------- | -| **texts**_(Iterable[str])_ | Strings to sort. | -| **reverse**_(int)_ | Sort in descending order (longest to shortest). 
| +**Returns**: -Example (ascending shortest to longest): +- _tuple[str, ...]_ - Strings sorted by length. -```python -rtk.sort_by_len(["longest", "short", "longer"]) -``` + -Result: +#### `ord_to_cpoint` ```python -('short', 'longer', 'longest') +def ord_to_cpoint(ordinal: int) -> str ``` -Example reversed (descending longest to shortest): +Character ordinal to character codepoint. -```python -rtk.sort_by_len(["longest", "short", "longer"], reverse=True) -``` +The codepoint is always 8 characters long (zero-padded). -Result: +**Example**: ```python -('longest', 'longer', 'short') +# Output: '00000061' +ord_to_cpoint(97) ``` -### ord_to_codepoint +**Arguments**: + +- `ordinal` _int_ - Character ordinal. -Function to get a character codepoint from a character ordinal. +**Returns**: -| Function Signature | -| :------------------------ | -| ord_to_codepoint(ordinal) | +- _str_ - Character codepoint. -| Parameters | | -| :----------------- | :----------------- | -| **ordinal**_(int)_ | Character ordinal. | + -Example: +#### `cpoint_to_ord` ```python -# ordinal: 127344 -ordinal = ord("πŸ…°") -rtk.ord_to_codepoint(ordinal) +def cpoint_to_ord(cpoint: str) -> int ``` -Result: +Character codepoint to character ordinal. -```python -'0001f170' -``` +**Arguments**: -### codepoint_to_ord +- `cpoint` _str_ - Character codepoint. -Function to get a character ordinal from a character codepoint. +**Returns**: -| Function Signature | -| :-------------------------- | -| codepoint_to_ord(codepoint) | +- _int_ - Character ordinal. -| Parameters | | -| :------------------- | :------------------- | -| **codepoint**_(str)_ | Character codepoint. | + -Example: +#### `char_to_cpoint` ```python -# char: "πŸ…°" -codepoint = "0001f170" -rtk.codepoint_to_ord(codepoint) +def char_to_cpoint(char: str) -> str ``` -Result: +Character to character codepoint. 
+ +**Example**: ```python -127344 +# Output: '00000061' +char_to_cpoint("a") ``` -### char_to_codepoint - -Function to get a character codepoint from a character. +**Arguments**: -| Function Signature | -| :---------------------- | -| char_to_codepoint(char) | +- `char` _str_ - Character. -| Parameters | | -| :-------------- | :--------- | -| **char**_(str)_ | Character. | +**Returns**: -Example: +- _str_ - Character codepoint. -```python -rtk.char_to_codepoint("πŸ…°") -``` + -Result: +#### `to_nfc` ```python -'0001f170' +def to_nfc(text: str) -> str ``` -### char_as_exp +Normalize a Unicode string to NFC form C. -Function to create a **RE** expression that exactly matches a character. +Form C favors the use of a fully combined character. -| Function Signature | -| :----------------- | -| char_as_exp(char) | +**Arguments**: -| Parameters | | -| :-------------- | :------------------ | -| **char**_(str)_ | Character to match. | +- `text` _str_ - String to normalize. -Example: +**Returns**: -```python -rtk.char_as_exp("πŸ…°") -``` +- _str_ - Normalized string. + + -Result: +#### `iter_char_range` ```python -r'\πŸ…°' +def iter_char_range(first_cpoint: int, + last_cpoint: int) -> Generator[str, None, None] ``` -### char_as_exp2 +Iterate all characters within a range of codepoints (inclusive). -Function to create a **RE** expression that exactly matches a character. +**Arguments**: -| Function Signature | -| :----------------- | -| char_as_exp2(char) | +- `first_cpoint` _int_ - Starting (first) codepoint. +- `last_cpoint` _int_ - Ending (last) codepoint. -| Parameters | | -| :-------------- | :------------------ | -| **char**_(str)_ | Character to match. | +**Yields**: -Example: +- _str_ - Characters within a range of codepoints. -```python -rtk.char_as_exp2("πŸ…°") -``` + -Result: +#### `char_range` ```python -r'\x{0001f170}' +def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...] 
``` -### string_as_exp +Tuple of all characters within a range of codepoints (inclusive). -Function to create a **RE** expression that exactly matches a string. +**Arguments**: -| Function Signature | -| :------------------ | -| string_as_exp(text) | +- `first_cpoint` _int_ - Starting (first) codepoint. +- `last_cpoint` _int_ - Ending (last) codepoint. -| Parameters | | -| :-------------- | :--------------- | -| **text**_(str)_ | String to match. | +**Returns**: -Example: +- _tuple[str, ...]_ - Characters within a range of codepoints. -```python -rtk.string_as_exp("πŸ…°πŸ…±πŸ…²") -``` + -Result: +#### `mask_span` ```python -r'\πŸ…°\πŸ…±\πŸ…²' +def mask_span(text: str, + span: list[int] | tuple[int, int], + mask: str | None = None) -> str ``` -### string_as_exp2 +Slice and mask a string using a single span. -Function to create a **RE** expression that exactly matches a string. +**Arguments**: -| Function Signature | -| :------------------- | -| string_as_exp2(text) | +- `text` _str_ - String to slice. +- `span` _list[int] | tuple[int, int]_ - Domain of index positions (start, end) to mask. +- `mask` _str, optional_ - Mask to insert after slicing. Defaults to None. -| Parameters | | -| :-------------- | :--------------- | -| **text**_(str)_ | String to match. | +**Returns**: -Example: +- _str_ - String with span replaced with the mask text. -```python -rtk.string_as_exp2("πŸ…°πŸ…±πŸ…²") -``` + -Result: +#### `mask_spans` ```python -r'\x{0001f170}\x{0001f171}\x{0001f172}' +def mask_spans(text: str, + spans: Iterable[list[int] | tuple[int, int]], + masks: Iterable[str] | None = None) -> str ``` -### strings_as_exp - -Function to create a **RE** expression that exactly matches any one string. - -| Function Signature | -| :-------------------- | -| strings_as_exp(texts) | +Slice and mask a string using multiple spans. -| Parameters | | -| :------------------------- | :---------------- | -| **texts**_(Iterable[str])_ | Strings to match. 
| +Todo: Add support for overlapping (and unordered?) spans. -Example: +**Arguments**: -```python -rtk.strings_as_exp([ - "bad.word", - "another-bad-word", -]) -``` +- `text` _str_ - String to slice. +- `spans` _Iterable[list[int] | tuple[int, int]]_ - Domains of index positions (x1, x2) to mask within the text. +- `masks` _Iterable[str], optional_ - Masks to insert when slicing. Defaults to None. -Result: +**Returns**: -```python -r'another\-bad\-word|bad\.word' -``` - -### strings_as_exp2 +- _str_ - String with all spans replaced with the mask text. -Function to create a **RE** expression that exactly matches any one string. + -| Function Signature | -| :--------------------- | -| strings_as_exp2(texts) | +# `regex_toolkit.base` -| Parameters | | -| :------------------------- | :---------------- | -| **texts**_(Iterable[str])_ | Strings to match. | + -Example: +#### `escape` ```python -rtk.strings_as_exp2([ - "bad.word", - "another-bad-word", -]) +def escape(char: str, flavor: int = 1) -> str ``` -Result: +Create a regex expression that exactly matches a character. -```python -r'another\-bad\-word|bad\.word' -``` +**Arguments**: + +- `char` _str_ - Character to match. +- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. -### iter_char_range +**Returns**: -Function to iterate all characters within a range of codepoints (inclusive). +- _str_ - Expression that exactly matches the original character. -| Function | -| :------------------------------------------------- | -| iter_char_range(first_codepoint, second_codepoint) | +**Raises**: -| Parameters | | -| :------------------------- | :-------------------------- | -| **first_codepoint**_(int)_ | Starting (first) codepoint. | -| **last_codepoint**_(int)_ | Ending (last) codepoint. | +- `ValueError` - Invalid regex flavor. 
-Example: + + +#### `string_as_exp` ```python -for char in rtk.iter_char_range("a", "c"): - print(char) +def string_as_exp(text: str, flavor: int = 1) -> str ``` -Output: +Create a regex expression that exactly matches a string. -```text -a -b -c -``` +**Arguments**: -### char_range +- `text` _str_ - String to match. +- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. -Function to get a tuple of all characters within a range of codepoints (inclusive). +**Returns**: -| Function | -| :-------------------------------------------- | -| char_range(first_codepoint, second_codepoint) | +- _str_ - Expression that exactly matches the original string. -| Parameters | | -| :------------------------- | :-------------------------- | -| **first_codepoint**_(int)_ | Starting (first) codepoint. | -| **last_codepoint**_(int)_ | Ending (last) codepoint. | +**Raises**: -Example: +- `ValueError` - Invalid regex flavor. -```python -rtk.char_range("a", "c") -``` + -Result: +#### `strings_as_exp` ```python -('a', 'b', 'c') +def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str ``` -### mask_span +Create a regex expression that exactly matches any one string. -Slice and mask a string using a span. +**Arguments**: -| Function Signature | -| :------------------------------- | -| mask_span(text, span, mask=None) | +- `texts` _Iterable[str]_ - Strings to match. +- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. -| Parameters | | -| :--------------------------------------- | :---------------------------------------------- | -| **text**_(str)_ | Text to slice. | -| **span**_(list[int] \| tuple[int, int])_ | Domain of index positions (start, end) to mask. | -| **mask**_(str \| None)_ | Mask to insert after slicing. | +**Returns**: -Example: +- _str_ - Expression that exactly matches any one of the original strings. 
-```python -rtk.mask_span( - "This is an example", - (8, 8), - mask="not ", -) -``` +**Raises**: -Result: +- `ValueError` - Invalid regex flavor. -```python -'This is not an example' -``` + -### mask_spans - -Slice and mask a string using multiple spans. +# `regex_toolkit.enums` -| Function Signature | -| :---------------------------------- | -| mask_spans(text, spans, masks=None) | +Enums. -| Parameters | | -| :-------------------------------------------------- | :--------------------------------------------------------- | -| **text**_(str)_ | Text to slice. | -| **spans**_(Iterable[list[int] \| tuple[int, int]])_ | Domains of index positions (x1, x2) to mask from the text. | -| **masks**_(Iterable[str] \| None)_ | Masks to insert when slicing. | + -Example: +## `RegexFlavor` Objects ```python -rtk.mask_spans( - "This is an example", - spans=[ - (9, 10), - (11, 18), - ], - masks=[ - " good", - "sample", - ], -) +class RegexFlavor(int, Enum) ``` -### to_utf8 +Regex flavors. -Encode a unicode string to UTF-8 form. +**Attributes**: -| Function Signature | -| :----------------- | -| to_utf8(text) | +- `RE` _int_ - Standard Python regex flavor. +- `RE2` _int_ - Google RE2 regex flavor. -| Parameters | | -| :-------------- | :-------------- | -| **text**_(str)_ | Text to encode. | -### to_nfc - -[Normalize](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize) a Unicode string to NFC form C. - -| Function Signature | -| :----------------- | -| to_utf8(text) | +--- -| Parameters | | -| :-------------- | :----------------- | -| **text**_(str)_ | Text to normalize. 
| diff --git a/ci/.gitignore b/ci/.gitignore new file mode 100644 index 0000000..10b884c --- /dev/null +++ b/ci/.gitignore @@ -0,0 +1,12 @@ +# Ignore everything +* + +# Except +!run_tests.sh +!pre_commit.sh +!condarc.yml + +!deps/ +!deps/* + +!.gitignore diff --git a/ci/condarc.yml b/ci/condarc.yml new file mode 100644 index 0000000..9d750b7 --- /dev/null +++ b/ci/condarc.yml @@ -0,0 +1,32 @@ +# https://docs.conda.io/projects/conda/en/latest/configuration.html + +# always_yes (NoneType, bool) +# aliases: yes +# Automatically choose the 'yes' option whenever asked to proceed with a +# conda operation, such as when running `conda install`. +# +always_yes: true + +# remote_connect_timeout_secs (float) +# The number seconds conda will wait for your client to establish a +# connection to a remote url resource. +# +remote_connect_timeout_secs: 30.0 + +# remote_max_retries (int) +# The maximum number of retries each HTTP connection should attempt. +# +remote_max_retries: 10 + +# remote_backoff_factor (int) +# The factor determines the time HTTP connection should wait for +# attempt. +# +remote_backoff_factor: 3 + +# remote_read_timeout_secs (float) +# Once conda has connected to a remote resource and sent an HTTP +# request, the read timeout is the number of seconds conda will wait for +# the server to send a response. 
+# +remote_read_timeout_secs: 60.0 diff --git a/ci/deps/.gitignore b/ci/deps/.gitignore new file mode 100644 index 0000000..ae282f7 --- /dev/null +++ b/ci/deps/.gitignore @@ -0,0 +1,8 @@ +# Ignore everything +* + +# Except +!actions-310.yml +!actions-311.yml + +!.gitignore diff --git a/ci/deps/actions-310.yml b/ci/deps/actions-310.yml new file mode 100644 index 0000000..a3a6672 --- /dev/null +++ b/ci/deps/actions-310.yml @@ -0,0 +1,15 @@ +name: regex_toolkit +channels: + - conda-forge +dependencies: + - python=3.10 + + # Test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + # - pytest-asyncio>=0.17 + + # Required dependencies + - pip: + - google-re2>=1.0 diff --git a/ci/deps/actions-311.yml b/ci/deps/actions-311.yml new file mode 100644 index 0000000..4a16510 --- /dev/null +++ b/ci/deps/actions-311.yml @@ -0,0 +1,15 @@ +name: regex_toolkit +channels: + - conda-forge +dependencies: + - python=3.11 + + # Test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + # - pytest-asyncio>=0.17 + + # Required dependencies + - pip: + - google-re2>=1.0 diff --git a/ci/run_tests.sh b/ci/run_tests.sh new file mode 100755 index 0000000..bb97064 --- /dev/null +++ b/ci/run_tests.sh @@ -0,0 +1,22 @@ +#!/bin/bash -e + +# Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set) +# https://github.com/pytest-dev/pytest/issues/920 +# https://github.com/pytest-dev/pytest/issues/1075 +export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') + +# May help reproduce flaky CI builds if set in subsequent runs +echo PYTHONHASHSEED=$PYTHONHASHSEED + +# If no X server is found, we use xvfb to emulate it +if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then + export DISPLAY=":0" + XVFB="xvfb-run " +fi + +# TODO: Consider adding as an input parameter +PYTEST_TARGET=tests +PYTEST_CMD="${XVFB}pytest -r fEs -s --cov=src --cov-report=xml --cov-append $PYTEST_TARGET" + +echo $PYTEST_CMD +sh 
-c "$PYTEST_CMD" diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..cdb7053 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,18 @@ +codecov: + branch: main + # notify: + # after_n_builds: 10 +# comment: false + +coverage: + status: + project: + default: + target: "82" + informational: true + patch: + default: + target: "50" + informational: true +# github_checks: +# annotations: false diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..95f63f7 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,11 @@ +# Ignore everything +* + +# Except +!render_readme.py +!config.json + +!templates/ +!templates/* + +!.gitignore diff --git a/docs/config.json b/docs/config.json new file mode 100644 index 0000000..fb6d134 --- /dev/null +++ b/docs/config.json @@ -0,0 +1,26 @@ +{ + "packages_dir": "./src", + "packages": ["regex_toolkit"], + "output_file": "./README.md", + "main_template": "main.md.jinja", + "templates_dir": "./docs/templates", + "rendered_filename": "rendered_libs.md", + "template_data": { + "pypi": { + "name": "regex-toolkit", + "full_name": "Regex-Toolkit" + }, + "codecov": { + "branch": "main", + "graphing_token": "NOT_YET_CONFIGURED" + }, + "repo": { + "name": "regex-toolkit", + "full_name": "Regex-Toolkit", + "owner": "Phosmic", + "badge_style": "flat-square", + "primary_branch": "main", + "actions_template": "ubuntu.yml" + } + } +} diff --git a/docs/render_readme.py b/docs/render_readme.py new file mode 100644 index 0000000..79a3a36 --- /dev/null +++ b/docs/render_readme.py @@ -0,0 +1,147 @@ +import argparse +import json +import logging +import os +import re +from pathlib import Path + +from jinja2 import Environment, FileSystemLoader +from pydoc_markdown import PydocMarkdown +from pydoc_markdown.contrib.loaders.python import PythonLoader +from pydoc_markdown.contrib.processors.crossref import CrossrefProcessor +from pydoc_markdown.contrib.processors.filter import FilterProcessor +from 
pydoc_markdown.contrib.processors.smart import GoogleProcessor +from pydoc_markdown.contrib.renderers.markdown import MarkdownRenderer +from pydoc_markdown.interfaces import Context + +logger: logging.Logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def load_config(config_file: str) -> dict: + # TODO: Resolve paths here + with open(config_file, mode="r", encoding="utf-8") as file: + return json.loads(file.read()) + + +def render_library_contents( + packages_dir: str, + packages: list[str], + templates_dir: str, + rendered_filename: str, +) -> None: + """Render the Documentation for Python Modules to a File + + Args: + packages_dir (str): Base directory to search for modules. + packages (list[str]): Packages to search for modules. + templates_dir (str): Directory containing the template files. + rendered_filename (str): File to render the library contents to. + """ + output_path = os.path.join(templates_dir, rendered_filename) + session = PydocMarkdown( + loaders=[ + PythonLoader(packages=packages, encoding="utf-8"), + ], + processors=[ + FilterProcessor( + expression="not name.startswith('_') and default()", + documented_only=True, + exclude_private=True, + exclude_special=True, + do_not_filter_modules=True, + skip_empty_modules=True, + ), + GoogleProcessor(), + CrossrefProcessor(), + ], + renderer=MarkdownRenderer( + filename=output_path, + encoding="utf-8", + code_headers=True, + add_method_class_prefix=True, + add_member_class_prefix=True, + signature_code_block=True, + render_typehint_in_data_header=True, + toc_maxdepth=3, + ), + ) + context = Context(packages_dir) + session.init(context) + session.ensure_initialized() + modules = session.load_modules() + session.process(modules) + # session.run_hooks("post-render") + session.render(modules, run_hooks=True) + + # TODO: Fix these "hacks" + + # Read the original + with open(output_path, mode="r", encoding="utf-8") as file: + rendered_contents = file.read() + + # NOTE: Any types containing 
a "_" will be excluded from this fix + # Fix some missing highlighting in the "**Returns**" and "**Yields**" sections + rendered_contents = re.sub( + r"\*\*(Returns|Yields)\*\*:\n\n ([a-zA-Z0-9,. \|\[\]]+): ", + r"**\1**:\n\n- `\2` - ", + rendered_contents, + ) + # Change the returns and yields code blocks to italics + rendered_contents = re.sub( + r"\*\*(Returns|Yields)\*\*:\n\n- `([a-zA-Z0-9,. \|\[\]]+)` - ", + r"**\1**:\n\n- _\2_ - ", + rendered_contents, + ) + + # Fix trailing newlines with two spaces + rendered_contents = re.sub(r"\n \n", "\n\n", rendered_contents) + + # Condense consecutive newlines to two + rendered_contents = re.sub(r"\n{2,}", "\n\n", rendered_contents) + + # Write the corrected contents + with open(output_path, mode="w", encoding="utf-8") as file: + file.write(rendered_contents) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--config", + type=Path, + default=Path("docs/config.json"), + help="Path to the config file.", + ) + args = parser.parse_args() + config_file_path: Path = args.config + + # Load the config + with open(config_file_path.resolve(), mode="r", encoding="utf-8") as file: + config = json.loads(file.read()) + + # Generate the library documentation + render_library_contents( + packages_dir=config["packages_dir"], + packages=config["packages"], + templates_dir=config["templates_dir"], + rendered_filename=config["rendered_filename"], + ) + + # Render the markdown readme + # TODO: Move this to a separate function? 
+ templates_dir_path = Path(config["templates_dir"]) + loader = FileSystemLoader(templates_dir_path.resolve()) + + environment = Environment(loader=loader, auto_reload=False) + template = environment.get_template(config["main_template"]) + rendered = template.render(**config["template_data"]) + + output_file_path = Path(config["output_file"]) + with open(output_file_path.resolve(), mode="w", encoding="utf-8") as file: + file.write(rendered) + + +if __name__ == "__main__": + main() diff --git a/docs/templates/.gitignore b/docs/templates/.gitignore new file mode 100644 index 0000000..2e53636 --- /dev/null +++ b/docs/templates/.gitignore @@ -0,0 +1,14 @@ +# Ignore everything +* + +# Except +!body.md.jinja +!header.md.jinja +!footer.md.jinja +!install.md.jinja +!library.md.jinja +!requirements.md.jinja +!usage.md.jinja +!main.md.jinja + +!.gitignore diff --git a/docs/templates/body.md.jinja b/docs/templates/body.md.jinja new file mode 100644 index 0000000..10b23bc --- /dev/null +++ b/docs/templates/body.md.jinja @@ -0,0 +1,26 @@ +## Requirements: + +{% include 'requirements.md.jinja' %} + +--- + +[Requirements](#requirements) +[Installing](#installing) +[Usage](#usage) +[Library](#library) + +## Installing + +{% with pypi=pypi, repo=repo %}{% include 'install.md.jinja' %}{% endwith %} + +--- + +## Usage + +{% include 'usage.md.jinja' %} + +--- + +## Library + +{% include 'library.md.jinja' %} \ No newline at end of file diff --git a/docs/templates/footer.md.jinja b/docs/templates/footer.md.jinja new file mode 100644 index 0000000..e69de29 diff --git a/docs/templates/header.md.jinja b/docs/templates/header.md.jinja new file mode 100644 index 0000000..c4e70d9 --- /dev/null +++ b/docs/templates/header.md.jinja @@ -0,0 +1 @@ +[{{ repo.full_name }}](https://github.com/{{ repo.owner }}/{{ repo.name }}) provides tools for creating [RE](https://docs.python.org/3/library/re.html) and [RE2](https://github.com/google/re2) expressions. 
\ No newline at end of file diff --git a/docs/templates/install.md.jinja b/docs/templates/install.md.jinja new file mode 100644 index 0000000..dcc34d3 --- /dev/null +++ b/docs/templates/install.md.jinja @@ -0,0 +1,27 @@ +Most stable version from [**PyPi**](https://pypi.org/project/{{ pypi.name }}/): + +[![PyPI](https://img.shields.io/pypi/v/{{ pypi.name }}?style=flat-square)](https://pypi.org/project/{{ pypi.name }}/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/{{ pypi.name }}?style=flat-square)](https://pypi.org/project/{{ pypi.name }}/) +[![PyPI - License](https://img.shields.io/pypi/l/{{ pypi.name }}?style=flat-square)](https://pypi.org/project/{{ pypi.name }}/) + +```bash +$ python3 -m pip install {{ pypi.name }} +``` + +Development version from [**GitHub**](https://github.com/{{ repo.owner }}/{{ repo.name }}): + +{% if repo.badge_style %} +![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/{{ repo.owner }}/{{ repo.name }}/{{ repo.actions_template }}?style={{ repo.badge_style }}) +![Codecov](https://img.shields.io/codecov/c/github/{{ repo.owner }}/{{ repo.name }}/{{ codecov.branch }}?flag=unittests&style={{ repo.badge_style }}&token={{ codecov.graphing_token }}) +![GitHub](https://img.shields.io/github/license/{{ repo.owner }}/{{ repo.name }}?style={{ repo.badge_style }}) +{% else %} +[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/{{ repo.owner }}/{{ repo.name }}/{{ repo.actions_template }})](https://github.com/{{ repo.owner }}/{{ repo.name }}) +[![Codecov](https://img.shields.io/codecov/c/github/{{ repo.owner }}/{{ repo.name }}/{{ codecov.branch }}?flag=unittests&token={{ codecov.graphing_token }})](https://github.com/{{ repo.owner }}/{{ repo.name }}) +[![GitHub](https://img.shields.io/github/license/{{ repo.owner }}/{{ repo.name }})](https://github.com/{{ repo.owner }}/{{ repo.name }}) +{% endif %} + +```bash +$ git clone git+https://github.com/{{ repo.owner }}/{{ repo.name 
}}.git +$ cd {{ repo.name }} +$ python3 -m pip install -e . +``` \ No newline at end of file diff --git a/docs/templates/library.md.jinja b/docs/templates/library.md.jinja new file mode 100644 index 0000000..c0f8264 --- /dev/null +++ b/docs/templates/library.md.jinja @@ -0,0 +1 @@ +{% include 'rendered_libs.md' %} \ No newline at end of file diff --git a/docs/templates/main.md.jinja b/docs/templates/main.md.jinja new file mode 100644 index 0000000..3d9df33 --- /dev/null +++ b/docs/templates/main.md.jinja @@ -0,0 +1,11 @@ +# {{ repo.full_name }} + +{% include 'header.md.jinja' %} + +--- + +{% include 'body.md.jinja' %} + +--- + +{% include 'footer.md.jinja' %} \ No newline at end of file diff --git a/docs/templates/requirements.md.jinja b/docs/templates/requirements.md.jinja new file mode 100644 index 0000000..48546b8 --- /dev/null +++ b/docs/templates/requirements.md.jinja @@ -0,0 +1,13 @@ +**{{ repo.full_name }}** requires Python 3.10 or higher, is platform independent, and has no outside dependencies. + +## Issue reporting + +If you discover an issue with {{ repo.full_name }}, please report it at [https://github.com/{{ repo.owner }}/{{ repo.name }}/issues](https://github.com/{{ repo.owner }}/{{ repo.name }}/issues). + +## License + +This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/. 
\ No newline at end of file diff --git a/docs/templates/usage.md.jinja b/docs/templates/usage.md.jinja new file mode 100644 index 0000000..393c0d8 --- /dev/null +++ b/docs/templates/usage.md.jinja @@ -0,0 +1,11 @@ +Import packages: + +```python +import re +# and/or +import re2 +``` + +```python +import regex_toolkit +``` \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..367bdfc --- /dev/null +++ b/environment.yml @@ -0,0 +1,24 @@ +# Local development dependencies including docs building, website upload, ASV benchmark +name: regex_toolkit +channels: + - conda-forge +dependencies: + - python=3.10 + - pip + + # Test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + # - pytest-asyncio>=0.17 + - coverage + + # Required dependencies + - pip: + - google-re2>=1.0 + + # Code checks + - black=22.10.0 + - flake8=6.0.0 + - isort>=5.2.1 + - mypy=1.0 diff --git a/pyproject.toml b/pyproject.toml index 9f7a663..fe46401 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,45 +4,152 @@ build-backend = "setuptools.build_meta" [project] name = "regex_toolkit" -description = "Effortlessly craft efficient RE and RE2 expressions with user-friendly tools." 
+description = "Tools for creating RE and RE2 expressions" readme = "README.md" -requires-python = ">=3.9,<4" -license = {text = "GPL-3.0-or-later"} -keywords = ["re", "re2", "expression", "regex", "pattern", "tool", "toolkit"] -authors = [ - {name = "Nicholas Londowski", email = "nick@phosmic.com"}, - {name = "Francis Salerno", email = "frank@phosmic.com"}, -] +requires-python = ">=3.10,<4" +license = { file = 'LICENSE' } +keywords = ["re", "re2", "regex", "expression", "pattern", "tool", "toolkit"] +authors = [{ name = "The Phosmic Development Team", email = "dev@phosmic.com" }] classifiers = [ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - "Natural Language :: English", - "Operating System :: OS Independent", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", - "Topic :: Utilities", - "Topic :: Text Processing", - "Topic :: Software Development :: Libraries", - "Topic :: Software Development :: Libraries :: Python Modules", + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Utilities", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", ] 
-dependencies = [] +dependencies = ["google-re2>=1.0"] dynamic = ["version"] +[project.urls] +homepage = "https://pypi.org/project/regex-toolkit/" +repository = "https://github.com/Phosmic/regex-toolkit" +issues = "https://github.com/Phosmic/regex-toolkit/issues" + [tool.setuptools] packages = ["regex_toolkit"] -package-dir = {"" = "src"} +package-dir = { "" = "src" } [tool.setuptools.dynamic] -version = {attr = "regex_toolkit.__version__"} +version = { attr = "regex_toolkit.__version__" } -[project.urls] -Homepage = "https://pypi.org/project/regex-toolkit/" -Repository = "https://github.com/Phosmic/regex-toolkit" -Issues = "https://github.com/Phosmic/regex-toolkit/issues" +[tool.coverage.run] +branch = true +source = ["src"] + +[tool.coverage.paths] +source = [ + "src", + # "*/site-packages" +] + +[tool.coverage.report] +ignore_errors = false +show_missing = true +omit = [] +exclude_lines = [ + # Have to re-enable the standard pragma: + "pragma: no cover", + # Don't complain about missing debug-only code: + "def __repr__", + "if self.debug", + # Don't complain if tests don't hit defensive assertion code: + "raise AssertionError", + "raise NotImplementedError", + "AbstractMethodError", + # Don't complain if non-runnable code isn't run: + "if 0:", + "if False:", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[tool.pylint.messages_control] +max-line-length = 88 +disable = [] + +[tool.black] +target-version = ['py310', 'py311'] +exclude = ''' +( + asv_bench/env + | \.egg + | \.git + | \.hg + | \.mypy_cache + | \.nox + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + | setup.py +) +''' + +[tool.mypy] +python_version = "3.10" +files = ["src"] +show_error_codes = true +pretty = true +# strict = true +allow_redefinition = true +disallow_subclassing_any = false +no_implicit_optional = true +local_partial_types = true +# no_implicit_reexport = true +strict_equality = true +warn_redundant_casts = true +warn_unused_configs = true 
+warn_unused_ignores = true +warn_unreachable = true + +[tool.isort] +profile = "black" +src_paths = ["src", "tests"] +multi_line_output = 3 +atomic = false +include_trailing_comma = true +force_grid_wrap = 3 +use_parentheses = true +balanced_wrapping = false +ensure_newline_before_comments = true +group_by_package = true +remove_redundant_aliases = false +combine_as_imports = false +honor_case_in_force_sorted_sections = true +combine_star = false +star_first = true +sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] +# skip_glob = [] + + +[tool.pytest.ini_options] +# NOTE: Have not decided on a minimum version yet +minversion = "7.0" +addopts = "--strict-config --strict-markers --capture=no --junitxml=test-data.xml" +empty_parameter_set_mark = "fail_at_collect" +# Must fail if any test is marked as xfail but passes +xfail_strict = true +testpaths = ["tests"] +doctest_optionflags = [ + # Ignores whitespace differences, useful for spanning multiple lines + "NORMALIZE_WHITESPACE", + # Ignores differences in exception messages + "IGNORE_EXCEPTION_DETAIL", + # Allows you to use ... in place of a value in an expected output string + "ELLIPSIS", +] +filterwarnings = [] +junit_family = "xunit2" +markers = [] +# asyncio_mode = "strict" diff --git a/requirements-doc.txt b/requirements-doc.txt new file mode 100644 index 0000000..8a39850 --- /dev/null +++ b/requirements-doc.txt @@ -0,0 +1,2 @@ +novella==0.2.3 +pydoc-markdown==4.6.4 \ No newline at end of file diff --git a/setup.py b/setup.py index 2c04005..42c2e30 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import os -REQUIRED_PYTHON = (3, 9) +REQUIRED_PYTHON = (3, 10) CURRENT_PYTHON = sys.version_info[:2] if CURRENT_PYTHON < REQUIRED_PYTHON: @@ -13,7 +13,7 @@ ========================== Unsupported Python version ========================== -This version of Regex Toolkit requires at least Python {}.{}, but you're trying to install it on Python {}.{}. 
+This version of Regex-Toolkit requires at least Python {}.{}, but you're trying to install it on Python {}.{}. """.format( *(REQUIRED_PYTHON + CURRENT_PYTHON) ) @@ -22,38 +22,7 @@ here = os.path.abspath(os.path.dirname(__file__)) -if sys.argv[-1] == "build": - # Build - status = os.system("python3 -m build") - sys.exit(status) -elif sys.argv[-1] == "publish": - # Build and publish - status = os.system("python3 -m build") - if status == 0: - status = os.system( - " ".join( - [ - "twine upload", - os.path.join(here, "dist", "regex_toolkit-*.tar.gz"), - os.path.join(here, "dist", "regex_toolkit-*.whl"), - ] - ) - ) - sys.exit(status) -elif sys.argv[-1] == "test": - # Test - import unittest - - # Default shared TestLoader instance - test_loader = unittest.defaultTestLoader - # Basic test runner that outputs to sys.stderr - test_runner = unittest.TextTestRunner() - # Discover all tests - test_suite = test_loader.discover(os.path.join(here, "tests")) - # Run the test suite - test_runner.run(test_suite) -else: - # Legacy install - from setuptools import setup +# Legacy install +from setuptools import setup - setup() +setup() diff --git a/src/regex_toolkit/.gitignore b/src/regex_toolkit/.gitignore index 5981298..69400b7 100644 --- a/src/regex_toolkit/.gitignore +++ b/src/regex_toolkit/.gitignore @@ -4,5 +4,8 @@ # Except !__init__.py !base.py +!constants.py +!enums.py +!utils.py !.gitignore diff --git a/src/regex_toolkit/__init__.py b/src/regex_toolkit/__init__.py index 36366a9..de90ba6 100644 --- a/src/regex_toolkit/__init__.py +++ b/src/regex_toolkit/__init__.py @@ -1,3 +1,37 @@ -from .base import * +from .base import ( + escape, + string_as_exp, + strings_as_exp, +) +from .utils import ( + char_range, + char_to_cpoint, + cpoint_to_ord, + iter_char_range, + iter_sort_by_len, + mask_span, + mask_spans, + ord_to_cpoint, + sort_by_len, + to_nfc, + to_utf8, +) -__version__ = "0.0.3" +__version__ = "0.0.4" + +__all__ = [ + "escape", + "char_range", + "char_to_cpoint", + 
"cpoint_to_ord", + "iter_char_range", + "iter_sort_by_len", + "mask_span", + "mask_spans", + "ord_to_cpoint", + "sort_by_len", + "string_as_exp", + "strings_as_exp", + "to_nfc", + "to_utf8", +] diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index 845e4bf..ca6a983 100644 --- a/src/regex_toolkit/base.py +++ b/src/regex_toolkit/base.py @@ -1,115 +1,42 @@ __all__ = [ - "iter_sort_by_len", - "sort_by_len", - "ord_to_codepoint", - "codepoint_to_ord", - "char_to_codepoint", - "char_as_exp", - "char_as_exp2", + "escape", "string_as_exp", - "string_as_exp2", "strings_as_exp", - "strings_as_exp2", - "iter_char_range", - "mask_span", - "mask_spans", - "to_utf8", - "to_nfc", ] -import string -import unicodedata - from collections.abc import Iterable -_ALPHA_CHARS: set[str] = set(string.ascii_letters) -_DIGIT_CHARTS: set[str] = set(string.digits) -_SAFE_CHARS: set[str] = _ALPHA_CHARS.union(_DIGIT_CHARTS).union(set(string.whitespace)) -_RE2_ESCAPABLE_CHARS: set[str] = set(string.punctuation) - - -def iter_sort_by_len( - texts: Iterable[str], - *, - reverse: bool = False, -) -> Iterable[str]: - """Iterate Texts Sorted by Length - - Args: - texts (Iterable[str]): Strings to sort. - reverse (bool, optional): Sort in descending order (longest to shortest). Defaults to False. - - Yields: - str: Strings sorted by length. - """ - for text in sorted(texts, key=len, reverse=reverse): - yield text - - -def sort_by_len( - texts: Iterable[str], - *, - reverse: bool = False, -) -> tuple[str, ...]: - """Strings Sorted by Length - - Args: - texts (Iterable[str]): Strings to sort. - reverse (bool, optional): Sort in descending order (longest to shortest). Defaults to False. - - Returns: - tuple[str]: Strings sorted by length. 
- """ - return tuple(iter_sort_by_len(texts, reverse=reverse)) +from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE +from regex_toolkit.enums import RegexFlavor +from regex_toolkit.utils import char_to_cpoint, iter_sort_by_len -def ord_to_codepoint(ordinal: int) -> str: - """Character Codepoint from Character Ordinal +def escape(char: str, flavor: int = 1) -> str: + """Create a regex expression that exactly matches a character. Args: - ordinal (int): Character ordinal. - - Returns: - str: Character codepoint. - """ - return format(ordinal, "x").zfill(8) - - -def codepoint_to_ord(codepoint: str) -> int: - """Character Ordinal from Character Codepoint - - Args: - codepoint (str): Character codepoint. + char (str): Character to match. + flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. Returns: - int: Character ordinal. - """ - return int(codepoint, 16) - - -def char_to_codepoint(char: str) -> str: - """Character Codepoint from Character + str: Expression that exactly matches the original character. - Args: - char (str): Character. - - Returns: - str: Character codepoint. + Raises: + ValueError: Invalid regex flavor. """ - return ord_to_codepoint(ord(char)) + try: + flavor = RegexFlavor(flavor) + except ValueError: + raise ValueError(f"Invalid regex flavor: {flavor}") + if flavor == RegexFlavor.RE: + return _escape(char) + # elif flavor == RegexFlavor.RE2: + else: + return _escape2(char) -def char_as_exp(char: str) -> str: - """Create a RE Regex Expression that Exactly Matches a Character - - Escape to avoid reserved character classes (i.e. \s, \S, \d, \D, \1, etc.). - - Args: - char (str): Character to match. - Returns: - str: RE expression that exactly matches the original character. 
- """ - if char in _SAFE_CHARS: +def _escape(char: str) -> str: + if char in ALWAYS_SAFE: # Safe as-is return char else: @@ -117,176 +44,79 @@ def char_as_exp(char: str) -> str: return f"\\{char}" -def char_as_exp2(char: str) -> str: - """Create a RE2 Regex Expression that Exactly Matches a Character - - Args: - char (str): Character to match. - - Returns: - str: RE2 expression that exactly matches the original character. - """ - if char in _SAFE_CHARS: +def _escape2(char: str) -> str: + if char in ALWAYS_SAFE: # Safe as-is return char - elif char in _RE2_ESCAPABLE_CHARS: + elif char in ALWAYS_ESCAPE: # Safe to escape with backslash return f"\\{char}" else: # Otherwise escape using the codepoint - return "\\x{" + char_to_codepoint(char) + "}" + return "\\x{" + char_to_cpoint(char) + "}" -def string_as_exp(text: str) -> str: - """Create a RE Regex Expression that Exactly Matches a String +def string_as_exp(text: str, flavor: int = 1) -> str: + """Create a regex expression that exactly matches a string. Args: text (str): String to match. + flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. Returns: - str: RE expression that exactly matches the original string. - """ - return r"".join(map(char_as_exp, text)) - - -def string_as_exp2(text: str) -> str: - """Create a RE2 Regex Expression that Exactly Matches a String + str: Expression that exactly matches the original string. - Args: - text (str): String to match. - - Returns: - str: RE2 expression that exactly matches the original string. - """ - return r"".join(map(char_as_exp2, text)) - - -def strings_as_exp(texts: Iterable[str]) -> str: - """Create a RE Regex expression that Exactly Matches Any One String - - Args: - texts (Iterable[str]): Strings to match. - - Returns: - str: RE expression that exactly matches any one of the original strings. + Raises: + ValueError: Invalid regex flavor. 
""" - return r"|".join( - map( - string_as_exp, - iter_sort_by_len(texts, reverse=True), - ) - ) - - -def strings_as_exp2(texts: Iterable[str]) -> str: - """Create a RE2 Regex expression that Exactly Matches Any One String - - Args: - texts (Iterable[str]): Strings to match. + try: + flavor = RegexFlavor(flavor) + except ValueError: + raise ValueError(f"Invalid regex flavor: {flavor}") - Returns: - str: RE2 expression that exactly matches any one of the original strings. - """ - return r"|".join( - map( - string_as_exp2, - iter_sort_by_len(texts, reverse=True), - ) - ) + if flavor == RegexFlavor.RE: + return _string_as_exp(text) + # elif flavor == RegexFlavor.RE2: + else: + return _string_as_exp2(text) -def iter_char_range(first_codepoint: int, last_codepoint: int) -> Iterable[str]: - """Iterate All Characters within a Range of Codepoints (Inclusive) +def _string_as_exp(text: str) -> str: + return r"".join(map(_escape, text)) - Args: - first_codepoint (int): Starting (first) codepoint. - last_codepoint (int): Ending (last) codepoint. - Yields: - str: Character from within a range of codepoints. - """ - for i in range(ord(first_codepoint), ord(last_codepoint) + 1): - yield chr(i) +def _string_as_exp2(text: str) -> str: + return r"".join(map(_escape2, text)) -def char_range(first_codepoint: int, last_codepoint: int) -> tuple[str, ...]: - """Tuple of All Characters within a Range of Codepoints (Inclusive) +def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str: + """Create a regex expression that exactly matches any one string. Args: - first_codepoint (int): Starting (first) codepoint. - last_codepoint (int): Ending (last) codepoint. + texts (Iterable[str]): Strings to match. + flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. Returns: - tuple[str, ...]: Characters within a range of codepoints. 
- """ - return tuple(iter_char_range(first_codepoint, last_codepoint)) - - -def mask_span( - text: str, - span: list[int] | tuple[int, int], - mask: str | None = None, -) -> str: - """Slice and Mask a String using a Span - - Args: - text (str): Text to slice. - span (list[int] | tuple[int, int]): Domain of index positions (start, end) to mask. - mask (str, optional): Mask to insert after slicing. Defaults to None. + str: Expression that exactly matches any one of the original strings. - Returns: - str: Text with span replaced with the mask text. + Raises: + ValueError: Invalid regex flavor. """ - if not 0 <= span[0] <= span[1] <= len(text): - raise ValueError(f"Invalid index positions for start and end: {span}") - if mask is None: - # No mask - return text[: span[0]] + text[span[1] :] - else: - # Use mask - return text[: span[0]] + mask + text[span[1] :] - + try: + flavor = RegexFlavor(flavor) + except ValueError: + raise ValueError(f"Invalid regex flavor: {flavor}") -def mask_spans( - text: str, - spans: Iterable[list[int] | tuple[int, int]], - masks: Iterable[str] | None = None, -) -> str: - """Slice and Mask a String using Multiple Spans - - Args: - text (str): Text to slice. - spans (Iterable[list[int] | tuple[int, int]]): Domains of index positions (x1, x2) to mask from the text. - masks (Iterable[str], optional): Masks to insert when slicing. Defaults to None. - - Returns: - str: Text with all spans replaced with the mask text. 
- """ - if masks is None: - # No masks - for span in reversed(spans): - text = mask_span(text, span, mask=None) + if flavor == RegexFlavor.RE: + return _strings_as_exp(texts) + # elif flavor == RegexFlavor.RE2: else: - # Has mask - for span, mask in zip(reversed(spans), reversed(masks)): - text = mask_span(text, span, mask=mask) - - return text - + return _strings_as_exp2(texts) -def to_utf8(text): - return text.encode("utf-8").decode("utf-8") +def _strings_as_exp(texts: Iterable[str]) -> str: + return r"|".join(map(_string_as_exp, iter_sort_by_len(texts, reverse=True))) -def to_nfc(text: str) -> str: - """Normalize a Unicode String to NFC Form C - Form C favors the use of a fully combined character. - - Args: - text (str): String to normalize. - - Returns: - str: Normalized string. - """ - return unicodedata.normalize("NFC", text) +def _strings_as_exp2(texts: Iterable[str]) -> str: + return r"|".join(map(_string_as_exp2, iter_sort_by_len(texts, reverse=True))) diff --git a/src/regex_toolkit/constants.py b/src/regex_toolkit/constants.py new file mode 100644 index 0000000..0d7cc43 --- /dev/null +++ b/src/regex_toolkit/constants.py @@ -0,0 +1,28 @@ +"""Constant values. + +This module contains constant values used throughout the project. 
+""" + +from typing import Final + +from regex_toolkit.enums import RegexFlavor + +__all__ = [ + "ALWAYS_ESCAPE", + "ALWAYS_SAFE", + "ASCIILETTERS", + "DIGITS", +] + +DIGITS: Final[frozenset[str]] = frozenset(map(chr, b"0123456789")) +ASCIILETTERS: Final[frozenset[str]] = frozenset( + map(chr, b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") +) +ALWAYS_SAFE: Final[frozenset[str]] = DIGITS | ASCIILETTERS +ALWAYS_ESCAPE: Final[frozenset[str]] = frozenset( + map(chr, b"()[]{}?*+-|^$\\.&~# \t\n\r\v\f") +) + +REGEX_FLAVORS: Final[frozenset[RegexFlavor]] = frozenset( + {RegexFlavor.RE, RegexFlavor.RE2} +) diff --git a/src/regex_toolkit/enums.py b/src/regex_toolkit/enums.py new file mode 100644 index 0000000..55a9b9d --- /dev/null +++ b/src/regex_toolkit/enums.py @@ -0,0 +1,15 @@ +"""Enums.""" + +from enum import Enum + + +class RegexFlavor(int, Enum): + """Regex flavors. + + Attributes: + RE (int): Standard Python regex flavor. + RE2 (int): Google RE2 regex flavor. + """ + + RE = 1 + RE2 = 2 diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py new file mode 100644 index 0000000..41c9df3 --- /dev/null +++ b/src/regex_toolkit/utils.py @@ -0,0 +1,199 @@ +import unicodedata +from collections.abc import Generator, Iterable + +__all__ = [ + "iter_sort_by_len", + "sort_by_len", + "ord_to_cpoint", + "cpoint_to_ord", + "char_to_cpoint", + "to_utf8", + "to_nfc", + "iter_char_range", + "char_range", + "mask_span", + "mask_spans", +] + + +def iter_sort_by_len( + texts: Iterable[str], + *, + reverse: bool = False, +) -> Generator[str, None, None]: + """Iterate strings sorted by length. + + Args: + texts (Iterable[str]): Strings to sort. + reverse (bool, optional): Sort in descending order (longest to shortest). Defaults to False. + + Yields: + str: Strings sorted by length. 
+ """ + for text in sorted(texts, key=len, reverse=reverse): + yield text + + +def sort_by_len( + texts: Iterable[str], + *, + reverse: bool = False, +) -> tuple[str, ...]: + """Sort strings by length. + + Args: + texts (Iterable[str]): Strings to sort. + reverse (bool, optional): Sort in descending order (longest to shortest). Defaults to False. + + Returns: + tuple[str, ...]: Strings sorted by length. + """ + return tuple(iter_sort_by_len(texts, reverse=reverse)) + + +def ord_to_cpoint(ordinal: int) -> str: + """Character ordinal to character codepoint. + + The codepoint is always 8 characters long (zero-padded). + + Example: + + ```python + # Output: '00000061' + ord_to_cpoint(97) + ``` + + Args: + ordinal (int): Character ordinal. + + Returns: + str: Character codepoint. + """ + return format(ordinal, "x").zfill(8) + + +def cpoint_to_ord(cpoint: str) -> int: + """Character codepoint to character ordinal. + + Args: + cpoint (str): Character codepoint. + + Returns: + int: Character ordinal. + """ + return int(cpoint, 16) + + +def char_to_cpoint(char: str) -> str: + """Character to character codepoint. + + Example: + + ```python + # Output: '00000061' + char_to_cpoint("a") + ``` + + Args: + char (str): Character. + + Returns: + str: Character codepoint. + """ + return ord_to_cpoint(ord(char)) + + +def to_utf8(text): + return text.encode("utf-8").decode("utf-8") + + +def to_nfc(text: str) -> str: + """Normalize a Unicode string to NFC form C. + + Form C favors the use of a fully combined character. + + Args: + text (str): String to normalize. + + Returns: + str: Normalized string. + """ + return unicodedata.normalize("NFC", text) + + +def iter_char_range(first_cpoint: int, last_cpoint: int) -> Generator[str, None, None]: + """Iterate all characters within a range of codepoints (inclusive). + + Args: + first_cpoint (int): Starting (first) codepoint. + last_cpoint (int): Ending (last) codepoint. + + Yields: + str: Characters within a range of codepoints. 
+ """ + for i in range(ord(first_cpoint), ord(last_cpoint) + 1): + yield chr(i) + + +def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...]: + """Tuple of all characters within a range of codepoints (inclusive). + + Args: + first_cpoint (int): Starting (first) codepoint. + last_cpoint (int): Ending (last) codepoint. + + Returns: + tuple[str, ...]: Characters within a range of codepoints. + """ + return tuple(iter_char_range(first_cpoint, last_cpoint)) + + +def mask_span( + text: str, + span: list[int] | tuple[int, int], + mask: str | None = None, +) -> str: + """Slice and mask a string using a single span. + + Args: + text (str): String to slice. + span (list[int] | tuple[int, int]): Domain of index positions (start, end) to mask. + mask (str, optional): Mask to insert after slicing. Defaults to None. + + Returns: + str: String with span replaced with the mask text. + """ + if mask is None: + # No mask + return text[: span[0]] + text[span[1] :] + else: + # Has mask + return text[: span[0]] + mask + text[span[1] :] + + +def mask_spans( + text: str, + spans: Iterable[list[int] | tuple[int, int]], + masks: Iterable[str] | None = None, +) -> str: + """Slice and mask a string using multiple spans. + + Todo: Add support for overlapping (and unordered?) spans. + + Args: + text (str): String to slice. + spans (Iterable[list[int] | tuple[int, int]]): Domains of index positions (x1, x2) to mask within the text. + masks (Iterable[str], optional): Masks to insert when slicing. Defaults to None. + + Returns: + str: String with all spans replaced with the mask text. 
+ """ + if masks is None: + # No masks + for span in reversed(spans): + text = mask_span(text, span, mask=None) + else: + # Has masks + for span, mask in zip(reversed(spans), reversed(masks)): + text = mask_span(text, span, mask=mask) + return text diff --git a/tests/.gitignore b/tests/.gitignore index be87017..9d40e1f 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -2,6 +2,8 @@ * # Except -!test_functions.py +!test_base.py +!test_enums.py +!test_utils.py !.gitignore diff --git a/tests/test_base.py b/tests/test_base.py new file mode 100644 index 0000000..afc1259 --- /dev/null +++ b/tests/test_base.py @@ -0,0 +1,555 @@ +import re +import unittest +from itertools import product + +import re2 + +import regex_toolkit +from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE +from regex_toolkit.enums import RegexFlavor + + +class TestEscapeRE(unittest.TestCase): + def setUp(self): + self._flavor = RegexFlavor.RE + self._re_compile = re.compile + + def test_safe(self): + for char in ALWAYS_SAFE: + with self.subTest(char=char): + expected_exp = char + actual_exp = regex_toolkit.escape(char, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the character. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(char)) + + def test_escapable(self): + for char in ALWAYS_ESCAPE: + with self.subTest(char=char): + expected_exp = f"\\{char}" + actual_exp = regex_toolkit.escape(char, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the character. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(char)) + + def test_unknown(self): + # TODO: Include additional characters to test. 
class TestEscapeRE2(unittest.TestCase):
    """Escaping of individual characters for the RE2 regex flavor."""

    def setUp(self):
        self._flavor = RegexFlavor.RE2
        self._re_compile = re2.compile

    def _assert_escape(self, char, expected_exp):
        # Shared check: escape the character, compare against the expected
        # expression, then confirm it both compiles and matches the character.
        actual_exp = regex_toolkit.escape(char, self._flavor)
        self.assertEqual(actual_exp, expected_exp)
        self.assertTrue(self._re_compile(actual_exp).match(char))

    def test_safe(self):
        # Always-safe characters pass through unescaped.
        for char in ALWAYS_SAFE:
            with self.subTest(char=char):
                self._assert_escape(char, char)

    def test_escapable(self):
        # Always-escaped characters get a single backslash prefix.
        for char in ALWAYS_ESCAPE:
            with self.subTest(char=char):
                self._assert_escape(char, f"\\{char}")

    def test_unknown(self):
        # TODO: Include additional characters to test.
        # NOTE: Same as running: "\\x{" + format(ord("πŸŒ„"), "x").zfill(8) + "}"
        cases = {
            # Length 1
            "πŸ…°": r"\x{0001f170}",
            "πŸ…±": r"\x{0001f171}",
            "πŸ…Ύ": r"\x{0001f17e}",
            "πŸ…Ώ": r"\x{0001f17f}",
            "πŸ†Ž": r"\x{0001f18e}",
            "πŸ†‘": r"\x{0001f191}",
            "πŸ†’": r"\x{0001f192}",
            "πŸ†“": r"\x{0001f193}",
            "πŸ†”": r"\x{0001f194}",
            "πŸ†•": r"\x{0001f195}",
            "πŸ†–": r"\x{0001f196}",
            "πŸ†—": r"\x{0001f197}",
            "πŸ†˜": r"\x{0001f198}",
            "πŸ†™": r"\x{0001f199}",
            "πŸ†š": r"\x{0001f19a}",
            "πŸ‡¦": r"\x{0001f1e6}",
            "πŸ‡§": r"\x{0001f1e7}",
            "πŸ‡¨": r"\x{0001f1e8}",
            "🈁": r"\x{0001f201}",
            "πŸˆ‚": r"\x{0001f202}",
            "🈚": r"\x{0001f21a}",
            "🈯": r"\x{0001f22f}",
            "🈲": r"\x{0001f232}",
            "🈳": r"\x{0001f233}",
            "🈴": r"\x{0001f234}",
            "🈡": r"\x{0001f235}",
            "🈢": r"\x{0001f236}",
            "🈷": r"\x{0001f237}",
            "🈸": r"\x{0001f238}",
            "🈹": r"\x{0001f239}",
            "🈺": r"\x{0001f23a}",
            "πŸ‰": r"\x{0001f250}",
            "πŸ‰‘": r"\x{0001f251}",
            "πŸŒ€": r"\x{0001f300}",
            "🌁": r"\x{0001f301}",
            "πŸŒ‚": r"\x{0001f302}",
            "πŸŒƒ": r"\x{0001f303}",
            "πŸŒ„": r"\x{0001f304}",
            # Length 2
            "πŸŒ…": r"\x{0001f305}",
        }
        for char, expected_exp in cases.items():
            with self.subTest(char=char):
                self._assert_escape(char, expected_exp)
+ pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + # TODO: Add tests for mix of characters. + def test_safe_joined_as_one(self): + # All characters. + text = "".join(ALWAYS_SAFE) + expected_exp = text + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_escapable_individual_char(self): + # Single character. + for char in ALWAYS_ESCAPE: + with self.subTest(char=char): + text = char + expected_exp = f"\\{char}" + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_escapable_joined_as_one(self): + # All characters. + text = "".join(ALWAYS_ESCAPE) + expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_unsafe_joined_as_one(self): + # All characters. + text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + expected_exp = "".join(f"\\{char}" for char in text) + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. 
+ pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + +class TestStringAsExpressionRE2(unittest.TestCase): + def setUp(self): + self._flavor = RegexFlavor.RE2 + self._re_compile = re2.compile + + # TODO: Add tests for mix of characters. + def test_safe_individual_char(self): + # Single character. + for char in ALWAYS_SAFE: + with self.subTest(char=char): + text = char + expected_exp = char + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_safe_joined_as_one(self): + # All characters. + text = "".join(ALWAYS_SAFE) + expected_exp = "".join(ALWAYS_SAFE) + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_escapable_individual_char(self): + # Single character. + for char in ALWAYS_ESCAPE: + with self.subTest(char=char): + text = char + expected_exp = f"\\{char}" + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_escapable_joined_as_one(self): + # All characters. + text = "".join(ALWAYS_ESCAPE) + expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. 
+ pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_unknown_joined_as_one(self): + text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + expected_exp = r"".join( + ( + r"\x{0001f170}", + r"\x{0001f171}", + r"\x{0001f17e}", + r"\x{0001f17f}", + r"\x{0001f18e}", + r"\x{0001f191}", + r"\x{0001f192}", + r"\x{0001f193}", + r"\x{0001f194}", + r"\x{0001f195}", + r"\x{0001f196}", + r"\x{0001f197}", + r"\x{0001f198}", + r"\x{0001f199}", + r"\x{0001f19a}", + r"\x{0001f1e6}", + r"\x{0001f1e7}", + r"\x{0001f1e8}", + r"\x{0001f201}", + r"\x{0001f202}", + r"\x{0001f21a}", + r"\x{0001f22f}", + r"\x{0001f232}", + r"\x{0001f233}", + r"\x{0001f234}", + r"\x{0001f235}", + r"\x{0001f236}", + r"\x{0001f237}", + r"\x{0001f238}", + r"\x{0001f239}", + r"\x{0001f23a}", + r"\x{0001f250}", + r"\x{0001f251}", + r"\x{0001f300}", + r"\x{0001f301}", + r"\x{0001f302}", + r"\x{0001f303}", + r"\x{0001f304}", + # Length 2 + r"\x{0001f305}", + ) + ) + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + +RESERVED_EXPRESSIONS = frozenset( + {r"\A", r"\b", r"\B", r"\d", r"\D", r"\s", r"\S", r"\w", r"\W", r"\Z", r"\1"} +) + + +class StringsAsExpressionRE(unittest.TestCase): + def setUp(self): + self._flavor = RegexFlavor.RE + self._re_compile = re.compile + self._max_combo_length = 2 + + def test_safe_of_various_lengths(self): + # Unique combinations of `ALWAYS_SAFE` using various lengths. 
+ elements = ALWAYS_SAFE + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) + + def test_escapable_of_various_lengths(self): + # Unique combinations of `ALWAYS_ESCAPE` using various lengths. + elements = ALWAYS_ESCAPE + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(f"\\{text}" for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) + + def test_reserved_of_various_lengths(self): + # Unique combinations of reserved expressions using various lengths. + # Exact matches that equate to reserved spaces + # E.g. Should match '\\' + 'n', not r'\n' + elements = RESERVED_EXPRESSIONS + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(f"\\{text}" for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) + + def test_unsafe_of_various_lengths(self): + # TODO: Include text/chars such as punctuation, etc. 
+ # Unique combinations of `ALWAYS_SAFE` using various lengths. + elements = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(f"\\{text}" for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) + + def test_safe_and_escapable_of_various_lengths(self): + # Unique combinations of `ALWAYS_SAFE` and `ALWAYS_ESCAPE` using various lengths. + elements = ALWAYS_SAFE | ALWAYS_ESCAPE + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join( + text if text in ALWAYS_SAFE else f"\\{text}" for text in texts + ) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. 
+ pattern = self._re_compile(actual_exp) + for text in texts: + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) + + # def test_actual_examples(self): + # + + +###################### +###################### # Multiple unsafe char +###################### self.assertEqual( +###################### regex_toolkit.strings_as_exp([".", "!", "?"], self._flavor), +###################### "\\.|\\!|\\?", +###################### ) +###################### +###################### for texts, expected_exp in [ +###################### (["πŸ…°"], "\\πŸ…°"), +###################### (["πŸ…°", "πŸ…±"], "\\πŸ…°|\\πŸ…±"), +###################### (["alpha", "beta"], "alpha|beta"), +###################### (["πŸ…°lpha", "πŸ…±eta"], "\\πŸ…°lpha|\\πŸ…±eta"), +###################### (["πŸ…°lpha", "Beta"], "\\πŸ…°lpha|Beta"), +###################### ]: +###################### self.assertEqual( +###################### regex_toolkit.strings_as_exp(texts, self._flavor), +###################### expected_exp, +###################### ) + + +class StringsAsExpressionRE2(unittest.TestCase): + def setUp(self): + self._flavor = RegexFlavor.RE2 + self._re_compile = re2.compile + self._max_combo_length = 2 + + def test_safe_of_variable_lengths(self): + # Unique combinations of ALWAYS_SAFE using various lengths. + elements = set(ALWAYS_SAFE) + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) + + def test_escapable_of_variable_lengths(self): + # Unique combinations of ALWAYS_ESCAPE using various lengths. 
+ elements = ALWAYS_ESCAPE + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(f"\\{text}" for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) + + def test_reserved_of_variable_lengths(self): + # Unique combinations of reserved expressions using various lengths. + # Exact matches that equate to reserved spaces + # E.g. Should match '\\' + 'n', not r'\n' + elements = RESERVED_EXPRESSIONS + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(f"\\{text}" for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) + + def test_unsafe_of_variable_lengths(self): + # TODO: Include text/chars such as punctuation, etc. + # Unique combinations of ALWAYS_SAFE using various lengths. 
+ elements_map = { + # Length 1 + "πŸ…°": r"\x{0001f170}", + "πŸ…±": r"\x{0001f171}", + "πŸ…Ύ": r"\x{0001f17e}", + "πŸ…Ώ": r"\x{0001f17f}", + "πŸ†Ž": r"\x{0001f18e}", + "πŸ†‘": r"\x{0001f191}", + "πŸ†’": r"\x{0001f192}", + "πŸ†“": r"\x{0001f193}", + "πŸ†”": r"\x{0001f194}", + "πŸ†•": r"\x{0001f195}", + "πŸ†–": r"\x{0001f196}", + "πŸ†—": r"\x{0001f197}", + "πŸ†˜": r"\x{0001f198}", + "πŸ†™": r"\x{0001f199}", + "πŸ†š": r"\x{0001f19a}", + "πŸ‡¦": r"\x{0001f1e6}", + "πŸ‡§": r"\x{0001f1e7}", + "πŸ‡¨": r"\x{0001f1e8}", + "🈁": r"\x{0001f201}", + "πŸˆ‚": r"\x{0001f202}", + "🈚": r"\x{0001f21a}", + "🈯": r"\x{0001f22f}", + "🈲": r"\x{0001f232}", + "🈳": r"\x{0001f233}", + "🈴": r"\x{0001f234}", + "🈡": r"\x{0001f235}", + "🈢": r"\x{0001f236}", + "🈷": r"\x{0001f237}", + "🈸": r"\x{0001f238}", + "🈹": r"\x{0001f239}", + "🈺": r"\x{0001f23a}", + "πŸ‰": r"\x{0001f250}", + "πŸ‰‘": r"\x{0001f251}", + "πŸŒ€": r"\x{0001f300}", + "🌁": r"\x{0001f301}", + "πŸŒ‚": r"\x{0001f302}", + "πŸŒƒ": r"\x{0001f303}", + "πŸŒ„": r"\x{0001f304}", + # Length 2 + "πŸŒ…": r"\x{0001f305}", + } + elements = tuple(elements_map) + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(elements_map[text] for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) + + +##############################3 # Exact matches that equate to reserved spaces +##############################3 # E.g. 
Should match '\\' + 'n', not r'\n' +##############################3 for text in (r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"): +##############################3 texts = [text] +##############################3 with self.subTest(texts=texts): +##############################3 self.assertEqual( +##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), +##############################3 f"\\{text}", +##############################3 ) +##############################3 +##############################3 # Single whitespace char +##############################3 for texts in (["\n"], ["\v"], ["\t"], ["\r"], ["\f"], ["\v"]): +##############################3 with self.subTest(texts=texts): +##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, self._flavor), texts[0]) +##############################3 +##############################3 # Single unsafe char +##############################3 for texts, expected_exp in [ +##############################3 (["."], "\\."), +##############################3 (["!"], "\\!"), +##############################3 (["?"], "\\?"), +##############################3 ]: +##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): +##############################3 self.assertEqual( +##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), +##############################3 expected_exp, +##############################3 ) +##############################3 +##############################3 # Multiple unsafe char +##############################3 texts = [".", "!", "?"] +##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, self._flavor), "\\.|\\!|\\?") +##############################3 +##############################3 for texts, expected_exp in [ +##############################3 (["πŸ…°"], "\\x{0001f170}"), +##############################3 (["πŸ…°", "πŸ…±"], "\\x{0001f170}|\\x{0001f171}"), +##############################3 
(["alpha", "beta"], "alpha|beta"), +##############################3 (["πŸ…°lpha", "πŸ…±eta"], "\\x{0001f170}lpha|\\x{0001f171}eta"), +##############################3 (["πŸ…°lpha", "Beta"], "\\x{0001f170}lpha|Beta"), +##############################3 ]: +##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): +##############################3 self.assertEqual( +##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), +##############################3 expected_exp, +##############################3 ) + +# TODO: Add tests for actually compiling the e. diff --git a/tests/test_enums.py b/tests/test_enums.py new file mode 100644 index 0000000..3af0dbd --- /dev/null +++ b/tests/test_enums.py @@ -0,0 +1,20 @@ +import pytest + +from regex_toolkit.enums import RegexFlavor + + +def test_regex_flavor_enum_is_int(): + assert isinstance(RegexFlavor.RE, int) + assert RegexFlavor.RE == 1 + assert RegexFlavor(1) == RegexFlavor.RE + assert isinstance(RegexFlavor.RE2, int) + assert RegexFlavor.RE2 == 2 + assert RegexFlavor(2) == RegexFlavor.RE2 + + +def test_invalid_regex_flavor_raises_value_error(): + with pytest.raises(ValueError): + RegexFlavor(0) + + with pytest.raises(ValueError): + RegexFlavor(3) diff --git a/tests/test_functions.py b/tests/test_functions.py deleted file mode 100644 index a03953b..0000000 --- a/tests/test_functions.py +++ /dev/null @@ -1,318 +0,0 @@ -import unittest -import regex_toolkit - -from collections.abc import Iterable -from itertools import product - - -class TestStringMethods(unittest.TestCase): - def test_iter_sort_by_len(self): - # Words used during test - texts = { - "apple", - "orange", - "banana", - "grape", - "apricot", - "cherry", - "plum", - "blueberry", - "strawberry", - "blackberry", - } - - # Run test using different iterable types - for try_type, texts_as_try_type in { - set: texts, - Iterable: iter(texts), - tuple: tuple(texts), - list: list(texts), - dict: dict.fromkeys(texts, 
None), - }.items(): - # Not reversed (shortest to longest) - result = regex_toolkit.iter_sort_by_len(texts_as_try_type, reverse=False) - - # Returns a iterable (allows for duplicate entries) - self.assertIsInstance(result, Iterable) - - # Result should have a equal number of texts - self.assertEqual(len(texts), len(tuple(result))) - - prev_len = None - for text in result: - if prev_len is not None: - self.assertGreaterEqual(len(text), prev_len) - - prev_len = len(text) - - # Run test using different iterable types - for try_type, texts_as_try_type in { - set: texts, - Iterable: iter(texts), - tuple: tuple(texts), - list: list(texts), - }.items(): - # Not reversed (longest to shortest) - result = regex_toolkit.iter_sort_by_len(texts_as_try_type, reverse=True) - - # Returns a iterable (allows for duplicate entries) - self.assertIsInstance(result, Iterable) - - # Result should have a equal number of texts - self.assertEqual(len(texts), len(tuple(result))) - - prev_len = None - for text in result: - if prev_len is not None: - self.assertLessEqual(len(text), prev_len) - - prev_len = len(text) - - def test_sort_by_len(self): - # Words used during test - texts = { - "apple", - "orange", - "banana", - "grape", - "apricot", - "cherry", - "plum", - "blueberry", - "strawberry", - "blackberry", - } - - # Run test using different iterable types - for try_type, texts_as_try_type in { - set: texts, - Iterable: iter(texts), - tuple: tuple(texts), - list: list(texts), - dict: dict.fromkeys(texts, None), - }.items(): - # Not reversed (shortest to longest) - result = regex_toolkit.sort_by_len(texts_as_try_type, reverse=False) - - # Returns a tuple (allows for duplicate entries) - self.assertIsInstance(result, tuple) - - # Result should have a equal number of texts - self.assertEqual(len(texts), len(result)) - - prev_len = None - for text in result: - if prev_len is not None: - self.assertGreaterEqual(len(text), prev_len) - - prev_len = len(text) - - # Run test using different 
iterable types - for try_type, texts_as_try_type in { - set: texts, - Iterable: iter(texts), - tuple: tuple(texts), - list: list(texts), - }.items(): - # Not reversed (longest to shortest) - result = regex_toolkit.sort_by_len(texts_as_try_type, reverse=True) - - # Returns a tuple (allows for duplicate entries) - self.assertIsInstance(result, tuple) - - # Result should have a equal number of texts - self.assertEqual(len(texts), len(result)) - - prev_len = None - for text in result: - if prev_len is not None: - self.assertLessEqual(len(text), prev_len) - - prev_len = len(text) - - def test_string_as_exp_safe_chars(self): - text = "".join(regex_toolkit._safe_chars) - actual_exp = regex_toolkit.string_as_exp(text) - expected_exp = "".join(regex_toolkit._safe_chars) - self.assertEqual(actual_exp, expected_exp) - - def test_string_as_exp2_escapable_chars(self): - text = "".join(regex_toolkit._escapable_chars) - actual_exp = regex_toolkit.string_as_exp2(text) - expected_exp = "\\" + "\\".join(regex_toolkit._escapable_chars) - self.assertEqual(actual_exp, expected_exp) - - def test_string_as_exp_safe_chars(self): - text = "".join(regex_toolkit._safe_chars) - actual_exp = regex_toolkit.string_as_exp(text) - expected_exp = "".join(regex_toolkit._safe_chars) - self.assertEqual(actual_exp, expected_exp) - - def test_string_as_exp2_escapable_chars(self): - text = "".join(regex_toolkit._escapable_chars) - actual_exp = regex_toolkit.string_as_exp2(text) - expected_exp = "\\" + "\\".join(regex_toolkit._escapable_chars) - self.assertEqual(actual_exp, expected_exp) - - def test_iter_char_range(self): - result = regex_toolkit.iter_char_range("a", "z") - - # Returns a iterable (no duplicate entries) - self.assertIsInstance(result, Iterable) - - # Validate output - actual_char_range = tuple(result) - excpected_char_range = tuple("abcdefghijklmnopqrstuvwxyz") - self.assertEqual(actual_char_range, excpected_char_range) - - def test_char_range(self): - result = 
regex_toolkit.char_range("a", "z") - - # Returns a tuple (no duplicate entries) - self.assertIsInstance(result, tuple) - - # Validate output - actual_char_range = result - excpected_char_range = tuple("abcdefghijklmnopqrstuvwxyz") - self.assertEqual(actual_char_range, excpected_char_range) - - def test_mask_span(self): - text = "This is an example" - - # Run test using different acceptable sequence types - indexes = (8, 8) - for try_type, indexes_as_try_type in { - tuple: indexes, - list: list(indexes), - }.items(): - actual_text = regex_toolkit.mask_span(text, indexes_as_try_type, "not ") - expected_text = "This is not an example" - self.assertEqual(actual_text, expected_text) - - # Run test using different acceptable sequence types - indexes = (5, 7) - for try_type, indexes_as_try_type in { - tuple: indexes, - list: list(indexes), - }.items(): - actual_text = regex_toolkit.mask_span(text, indexes_as_try_type, "isn't") - expected_text = "This isn't an example" - self.assertEqual(actual_text, expected_text) - - def test_char_as_exp(self): - for char, expected_exp in ( - ("s", "s"), - ("d", "d"), - ("\n", "\n"), - (".", "\\."), - ("!", "\\!"), - ("?", "\\?"), - ("πŸ…°", "\\πŸ…°"), - ): - actual_exp = regex_toolkit.char_as_exp(char) - self.assertEqual(actual_exp, expected_exp) - - def test_char_as_exp2(self): - for char, expected_exp in ( - ("s", "s"), - ("d", "d"), - ("\n", "\n"), - (".", "\\."), - ("!", "\\!"), - ("?", "\\?"), - ("πŸ…°", r"\x{0001f170}"), - ): - actual_exp = regex_toolkit.char_as_exp2(char) - self.assertEqual(actual_exp, expected_exp) - - def test_strings_as_exp(self): - # Alphanumeric single char and multi-char combos - for i in range(4): - for char_tuple in product(i * ["a", "b", "0", "1"]): - actual_exp = regex_toolkit.strings_as_exp(char_tuple) - expected_exp = "|".join(char_tuple) - self.assertEqual(actual_exp, expected_exp) - - # Exact matches that equate to reserved spaces - # E.g. 
Should match '\\' + 'd', not r'\d' - for text in {r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"}: - actual_exp = regex_toolkit.strings_as_exp([text]) - expected_exp = f"\\{text}" - self.assertEqual(actual_exp, expected_exp) - - # Single whitespace char - for text in {"\n", "\v", "\t", "\r", "\f", "\v"}: - actual_exp = regex_toolkit.strings_as_exp([text]) - expected_exp = text - self.assertEqual(actual_exp, expected_exp) - - # Single unsafe char - for texts, expected_exp in [ - (["."], "\\."), - (["!"], "\\!"), - (["?"], "\\?"), - ]: - actual_exp = regex_toolkit.strings_as_exp(texts) - self.assertEqual(actual_exp, expected_exp) - - # Multiple unsafe char - texts = [".", "!", "?"] - expected_exp = "\\.|\\!|\\?" - actual_exp = regex_toolkit.strings_as_exp(texts) - self.assertEqual(actual_exp, expected_exp) - - for texts, expected_exp in [ - (["πŸ…°"], "\\πŸ…°"), - (["πŸ…°", "πŸ…±"], "\\πŸ…°|\\πŸ…±"), - (["alpha", "beta"], "alpha|beta"), - (["πŸ…°lpha", "πŸ…±eta"], "\\πŸ…°lpha|\\πŸ…±eta"), - (["πŸ…°lpha", "Beta"], "\\πŸ…°lpha|Beta"), - ]: - actual_exp = regex_toolkit.strings_as_exp(texts) - self.assertEqual(actual_exp, expected_exp) - - def test_strings_as_exp2(self): - # Alphanumeric single char and multi-char combos - for i in range(4): - for char_tuple in product(i * ["a", "b", "0", "1"]): - actual_exp = regex_toolkit.strings_as_exp2(char_tuple) - expected_exp = "|".join(char_tuple) - self.assertEqual(actual_exp, expected_exp) - - # Exact matches that equate to reserved spaces - # E.g. 
Should match '\\' + 'd', not r'\d' - for text in {r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"}: - actual_exp = regex_toolkit.strings_as_exp2([text]) - expected_exp = f"\\{text}" - self.assertEqual(actual_exp, expected_exp) - - # Single whitespace char - for text in {"\n", "\v", "\t", "\r", "\f", "\v"}: - actual_exp = regex_toolkit.strings_as_exp2([text]) - expected_exp = text - self.assertEqual(actual_exp, expected_exp) - - # Single unsafe char - for texts, expected_exp in [ - (["."], "\\."), - (["!"], "\\!"), - (["?"], "\\?"), - ]: - actual_exp = regex_toolkit.strings_as_exp2(texts) - self.assertEqual(actual_exp, expected_exp) - - # Multiple unsafe char - texts = [".", "!", "?"] - expected_exp = "\\.|\\!|\\?" - actual_exp = regex_toolkit.strings_as_exp2(texts) - self.assertEqual(actual_exp, expected_exp) - - for texts, expected_exp in [ - (["πŸ…°"], "\\x{0001f170}"), - (["πŸ…°", "πŸ…±"], "\\x{0001f170}|\\x{0001f171}"), - (["alpha", "beta"], "alpha|beta"), - (["πŸ…°lpha", "πŸ…±eta"], "\\x{0001f170}lpha|\\x{0001f171}eta"), - (["πŸ…°lpha", "Beta"], "\\x{0001f170}lpha|Beta"), - ]: - actual_exp = regex_toolkit.strings_as_exp2(texts) - self.assertEqual(actual_exp, expected_exp) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..c1fccdf --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,119 @@ +import unittest +from collections.abc import Generator, Iterable + +import regex_toolkit + + +def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool: + prev_len = None + for text in texts: + if prev_len is None: + prev_len = len(text) + if reverse: + if len(text) > prev_len: + return False + else: + if len(text) < prev_len: + return False + prev_len = len(text) + return True + + +class TestSortByLength(unittest.TestCase): + def setUp(self) -> None: + self.texts = { + "apple", + "orange", + "banana", + "grape", + "apricot", + "cherry", + "plum", + "blueberry", + "strawberry", + "blackberry", + } + self.texts_by_type = ( 
+ (set, self.texts), + (frozenset, frozenset(self.texts)), + (tuple, tuple(self.texts)), + (list, list(self.texts)), + (dict, dict.fromkeys(self.texts, None)), + ) + + def test_iter_sort_by_len(self): + for try_type, typed_texts in self.texts_by_type: + for reverse in (False, True): + with self.subTest( + try_type=try_type, + typed_texts=typed_texts, + reverse=reverse, + ): + result = regex_toolkit.iter_sort_by_len( + typed_texts, + reverse=reverse, + ) + self.assertIsInstance(result, Generator) + result_tuple = tuple(result) + self.assertTrue(is_sorted_by_len(result_tuple, reverse=reverse)) + self.assertEqual( + result_tuple, + tuple(sorted(typed_texts, key=len, reverse=reverse)), + ) + + def test_sort_by_len(self): + for try_type, typed_texts in self.texts_by_type: + for reverse in (False, True): + with self.subTest( + try_type=try_type, + typed_texts=typed_texts, + reverse=reverse, + ): + result = regex_toolkit.sort_by_len(typed_texts, reverse=reverse) + self.assertIsInstance(result, tuple) + self.assertTrue(is_sorted_by_len(result, reverse=reverse)) + self.assertEqual( + result, + tuple(sorted(typed_texts, key=len, reverse=reverse)), + ) + + +class TestIterCharRange(unittest.TestCase): + def test_iter_char_range(self): + result = regex_toolkit.iter_char_range("a", "z") + self.assertIsInstance(result, Generator) + self.assertTupleEqual( + tuple(result), + tuple("abcdefghijklmnopqrstuvwxyz"), + ) + + def test_char_range(self): + result = regex_toolkit.char_range("a", "z") + self.assertIsInstance(result, tuple) + self.assertTupleEqual( + result, + tuple("abcdefghijklmnopqrstuvwxyz"), + ) + + +class TestMasking(unittest.TestCase): + def setUp(self): + self.text = "This is an example" + + def test_insert_word(self): + indexes = (8, 8) + for try_type, typed_indexes in ((tuple, indexes), (list, list(indexes))): + with self.subTest(try_type=try_type, indexes=indexes): + self.assertEqual( + regex_toolkit.mask_span(self.text, typed_indexes, "not "), + "This is not an 
example", + ) + + def test_replace_word(self): + indexes = (5, 7) + for try_type, typed_indexes in ((tuple, indexes), (list, list(indexes))): + with self.subTest(try_type=try_type, indexes=indexes): + self.assertEqual( + regex_toolkit.mask_span(self.text, typed_indexes, "isn't"), + "This isn't an example", + )