From 2468af674698c7e5be8b2eb336c4bbbd01d41fb6 Mon Sep 17 00:00:00 2001 From: yaphott Date: Thu, 9 Mar 2023 21:06:42 -0600 Subject: [PATCH 1/7] Update makefile, setup.py, and pyproject.toml. --- Makefile | 10 ++- pyproject.toml | 169 ++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 145 insertions(+), 34 deletions(-) diff --git a/Makefile b/Makefile index 23d0b15..46d3437 100644 --- a/Makefile +++ b/Makefile @@ -3,11 +3,15 @@ PYTHON=python3 install: ${PYTHON} -m pip install . +install-dev: + ${PYTHON} -m pip install -e . + test: - ${PYTHON} setup.py test + ${PYTHON} -m pytest tests build: - ${PYTHON} setup.py build + ${PYTHON} -m build publish: - ${PYTHON} setup.py publish + ${PYTHON} -m build + twine upload dist/regex_toolkit-*.tar.gz dist/regex_toolkit-*.whl diff --git a/pyproject.toml b/pyproject.toml index 9f7a663..88635b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,45 +4,152 @@ build-backend = "setuptools.build_meta" [project] name = "regex_toolkit" -description = "Effortlessly craft efficient RE and RE2 expressions with user-friendly tools." 
+description = "Effortlessly craft efficient RE and RE2 expressions with user-friendly tools" readme = "README.md" -requires-python = ">=3.9,<4" -license = {text = "GPL-3.0-or-later"} +requires-python = ">=3.10,<4" +license = { file = 'LICENSE' } keywords = ["re", "re2", "expression", "regex", "pattern", "tool", "toolkit"] -authors = [ - {name = "Nicholas Londowski", email = "nick@phosmic.com"}, - {name = "Francis Salerno", email = "frank@phosmic.com"}, -] +authors = [{ name = "The Phosmic Development Team", email = "dev@phosmic.com" }] classifiers = [ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - "Natural Language :: English", - "Operating System :: OS Independent", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", - "Topic :: Utilities", - "Topic :: Text Processing", - "Topic :: Software Development :: Libraries", - "Topic :: Software Development :: Libraries :: Python Modules", + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Utilities", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", ] -dependencies = [] +dependencies = 
["google-re2>=0.2.20210901", "lxml>=4.6.3"] dynamic = ["version"] +[project.urls] +homepage = "https://pypi.org/project/regex-toolkit/" +repository = "https://github.com/Phosmic/regex-toolkit" +issues = "https://github.com/Phosmic/regex-toolkit/issues" + [tool.setuptools] packages = ["regex_toolkit"] -package-dir = {"" = "src"} +package-dir = { "" = "src" } [tool.setuptools.dynamic] -version = {attr = "regex_toolkit.__version__"} +version = { attr = "regex_toolkit.__version__" } -[project.urls] -Homepage = "https://pypi.org/project/regex-toolkit/" -Repository = "https://github.com/Phosmic/regex-toolkit" -Issues = "https://github.com/Phosmic/regex-toolkit/issues" +[tool.coverage.run] +branch = true +source = ["src"] + +[tool.coverage.paths] +source = [ + "src", + # "*/site-packages" +] + +[tool.coverage.report] +ignore_errors = false +show_missing = true +omit = [] +exclude_lines = [ + # Have to re-enable the standard pragma: + "pragma: no cover", + # Don't complain about missing debug-only code: + "def __repr__", + "if self.debug", + # Don't complain if tests don't hit defensive assertion code: + "raise AssertionError", + "raise NotImplementedError", + "AbstractMethodError", + # Don't complain if non-runnable code isn't run: + "if 0:", + "if False:", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[tool.pylint.messages_control] +max-line-length = 88 +disable = [] + +[tool.black] +target-version = ['py310', 'py311'] +exclude = ''' +( + asv_bench/env + | \.egg + | \.git + | \.hg + | \.mypy_cache + | \.nox + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + | setup.py +) +''' + +[tool.mypy] +python_version = "3.10" +files = ["src"] +show_error_codes = true +pretty = true +# strict = true +allow_redefinition = true +disallow_subclassing_any = false +no_implicit_optional = true +local_partial_types = true +# no_implicit_reexport = true +strict_equality = true +warn_redundant_casts = true +warn_unused_configs = true +warn_unused_ignores = true 
+warn_unreachable = true + +[tool.isort] +profile = "black" +src_paths = ["src", "tests"] +multi_line_output = 3 +atomic = false +include_trailing_comma = true +force_grid_wrap = 3 +use_parentheses = true +balanced_wrapping = false +ensure_newline_before_comments = true +group_by_package = true +remove_redundant_aliases = false +combine_as_imports = false +honor_case_in_force_sorted_sections = true +combine_star = false +star_first = true +sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] +# skip_glob = [] + + +[tool.pytest.ini_options] +# NOTE: Have not decided on a minimum version yet +minversion = "7.0" +addopts = "--strict-config --strict-markers --capture=no --junitxml=test-data.xml" +empty_parameter_set_mark = "fail_at_collect" +# Must fail if any test is marked as xfail but passes +xfail_strict = true +testpaths = ["tests"] +doctest_optionflags = [ + # Ignores whitespace differences, useful for spanning multiple lines + "NORMALIZE_WHITESPACE", + # Ignores differences in exception messages + "IGNORE_EXCEPTION_DETAIL", + # Allows you to use ... in place of a value in an expected output string + "ELLIPSIS", +] +filterwarnings = [] +junit_family = "xunit2" +markers = [] +# asyncio_mode = "strict" From af8ad81897afd79bd7d1319f115f863da9a6056b Mon Sep 17 00:00:00 2001 From: yaphott Date: Thu, 9 Mar 2023 21:42:13 -0600 Subject: [PATCH 2/7] Updated tests, add CI, doc generation, and reformatted docstrings. 
--- .github/.gitignore | 4 + .github/ISSUE_TEMPLATE/.gitignore | 4 +- .../{bug_report.yaml => bug_report.yml} | 2 +- ...ature_request.yaml => feature_request.yml} | 4 +- .github/actions/.gitignore | 12 + .../actions/build_regex_toolkit/.gitignore | 7 + .../actions/build_regex_toolkit/action.yml | 15 + .github/actions/run-tests/.gitignore | 7 + .github/actions/run-tests/action.yml | 31 + .github/actions/setup-conda/.gitignore | 7 + .github/actions/setup-conda/action.yml | 23 + .github/workflows/.gitignore | 9 + .github/workflows/stale-pr.yml | 26 + .github/workflows/ubuntu.yml | 54 ++ .gitignore | 8 +- README.md | 638 +++++++++++------- ci/.gitignore | 12 + ci/condarc.yml | 32 + ci/deps/.gitignore | 8 + ci/deps/actions-310.yml | 11 + ci/deps/actions-311.yml | 11 + ci/run_tests.sh | 22 + docs/.gitignore | 11 + docs/config.json | 22 + docs/render_readme.py | 132 ++++ docs/templates/.gitignore | 14 + docs/templates/body.md.jinja | 26 + docs/templates/footer.md.jinja | 0 docs/templates/header.md.jinja | 1 + docs/templates/install.md.jinja | 27 + docs/templates/library.md.jinja | 1 + docs/templates/main.md.jinja | 11 + docs/templates/requirements.md.jinja | 13 + docs/templates/usage.md.jinja | 11 + environment.yml | 20 + pyproject.toml | 2 +- requirements-doc.txt | 2 + setup.py | 41 +- src/regex_toolkit/.gitignore | 1 + src/regex_toolkit/__init__.py | 39 +- src/regex_toolkit/base.py | 74 +- src/regex_toolkit/constants.py | 21 + tests/test_functions.py | 20 +- 43 files changed, 1082 insertions(+), 354 deletions(-) rename .github/ISSUE_TEMPLATE/{bug_report.yaml => bug_report.yml} (99%) rename .github/ISSUE_TEMPLATE/{feature_request.yaml => feature_request.yml} (92%) create mode 100644 .github/actions/.gitignore create mode 100644 .github/actions/build_regex_toolkit/.gitignore create mode 100644 .github/actions/build_regex_toolkit/action.yml create mode 100644 .github/actions/run-tests/.gitignore create mode 100644 .github/actions/run-tests/action.yml create mode 100644 
.github/actions/setup-conda/.gitignore create mode 100644 .github/actions/setup-conda/action.yml create mode 100644 .github/workflows/.gitignore create mode 100644 .github/workflows/stale-pr.yml create mode 100644 .github/workflows/ubuntu.yml create mode 100644 ci/.gitignore create mode 100644 ci/condarc.yml create mode 100644 ci/deps/.gitignore create mode 100644 ci/deps/actions-310.yml create mode 100644 ci/deps/actions-311.yml create mode 100755 ci/run_tests.sh create mode 100644 docs/.gitignore create mode 100644 docs/config.json create mode 100644 docs/render_readme.py create mode 100644 docs/templates/.gitignore create mode 100644 docs/templates/body.md.jinja create mode 100644 docs/templates/footer.md.jinja create mode 100644 docs/templates/header.md.jinja create mode 100644 docs/templates/install.md.jinja create mode 100644 docs/templates/library.md.jinja create mode 100644 docs/templates/main.md.jinja create mode 100644 docs/templates/requirements.md.jinja create mode 100644 docs/templates/usage.md.jinja create mode 100644 environment.yml create mode 100644 requirements-doc.txt create mode 100644 src/regex_toolkit/constants.py diff --git a/.github/.gitignore b/.github/.gitignore index 98b34b4..b9bdbd9 100644 --- a/.github/.gitignore +++ b/.github/.gitignore @@ -2,7 +2,11 @@ * # Except +!actions/ +!actions/* !ISSUE_TEMPLATE/ !ISSUE_TEMPLATE/* +!workflows/ +!workflows/* !.gitignore diff --git a/.github/ISSUE_TEMPLATE/.gitignore b/.github/ISSUE_TEMPLATE/.gitignore index eb5bc56..fc83fe4 100644 --- a/.github/ISSUE_TEMPLATE/.gitignore +++ b/.github/ISSUE_TEMPLATE/.gitignore @@ -2,8 +2,8 @@ * # Except -!bug_report.yaml -!feature_request.yaml +!bug_report.yml +!feature_request.yml !question.md !.gitignore diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yml similarity index 99% rename from .github/ISSUE_TEMPLATE/bug_report.yaml rename to .github/ISSUE_TEMPLATE/bug_report.yml index 638b689..c3decc6 100644 --- 
a/.github/ISSUE_TEMPLATE/bug_report.yaml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -1,6 +1,6 @@ name: Bug report description: Are you experiencing a problem? Create a report to help us improve! -labels: "bug" +labels: ["bug"] body: - type: markdown attributes: diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yml similarity index 92% rename from .github/ISSUE_TEMPLATE/feature_request.yaml rename to .github/ISSUE_TEMPLATE/feature_request.yml index 0fcf562..5c09dff 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yaml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -1,6 +1,6 @@ name: Feature Request description: Want a feature? Ask; we don't bite! -labels: 'enhancement' +labels: ["enhancement"] body: - type: markdown attributes: @@ -24,4 +24,4 @@ body: description: If so, specify - type: input attributes: - label: Additional context \ No newline at end of file + label: Additional context diff --git a/.github/actions/.gitignore b/.github/actions/.gitignore new file mode 100644 index 0000000..4769b9d --- /dev/null +++ b/.github/actions/.gitignore @@ -0,0 +1,12 @@ +# Ignore everything +* + +# Except +!build_regex_toolkit/ +!build_regex_toolkit/* +!run-tests/ +!run-tests/* +!setup-conda/ +!setup-conda/* + +!.gitignore diff --git a/.github/actions/build_regex_toolkit/.gitignore b/.github/actions/build_regex_toolkit/.gitignore new file mode 100644 index 0000000..34b5ce3 --- /dev/null +++ b/.github/actions/build_regex_toolkit/.gitignore @@ -0,0 +1,7 @@ +# Ignore everything +* + +# Except +!action.yml + +!.gitignore diff --git a/.github/actions/build_regex_toolkit/action.yml b/.github/actions/build_regex_toolkit/action.yml new file mode 100644 index 0000000..de017a6 --- /dev/null +++ b/.github/actions/build_regex_toolkit/action.yml @@ -0,0 +1,15 @@ +name: Build regex_toolkit +description: Build regex_toolkit +runs: + using: composite + steps: + - name: Environment Detail + run: | + micromamba info + micromamba list 
+ shell: bash -el {0} + + - name: Build Regex-Toolkit + run: | + python -m pip install -e . --no-build-isolation --no-index + shell: bash -el {0} diff --git a/.github/actions/run-tests/.gitignore b/.github/actions/run-tests/.gitignore new file mode 100644 index 0000000..34b5ce3 --- /dev/null +++ b/.github/actions/run-tests/.gitignore @@ -0,0 +1,7 @@ +# Ignore everything +* + +# Except +!action.yml + +!.gitignore diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml new file mode 100644 index 0000000..c42229c --- /dev/null +++ b/.github/actions/run-tests/action.yml @@ -0,0 +1,31 @@ +name: Run tests and report results +description: Run tests and report results +# inputs: +# codecov-token: +# description: Codecov token for private repo. + +runs: + using: composite + steps: + - name: Test + run: ci/run_tests.sh + shell: bash -el {0} + + - name: Publish Test Results + uses: actions/upload-artifact@v2 + with: + name: Test results + path: test-data.xml + + - name: Report Coverage + run: coverage report -m + shell: bash -el {0} + + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v3 + with: + # token: ${{ inputs.codecov-token }} + files: ./coverage.xml + flags: unittests + name: regex_toolkit + fail_ci_if_error: false diff --git a/.github/actions/setup-conda/.gitignore b/.github/actions/setup-conda/.gitignore new file mode 100644 index 0000000..34b5ce3 --- /dev/null +++ b/.github/actions/setup-conda/.gitignore @@ -0,0 +1,7 @@ +# Ignore everything +* + +# Except +!action.yml + +!.gitignore diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml new file mode 100644 index 0000000..39cb954 --- /dev/null +++ b/.github/actions/setup-conda/action.yml @@ -0,0 +1,23 @@ +name: Set up Conda environment +description: Set up Conda environment +inputs: + environment-file: + description: Conda environment file to use. 
+ default: environment.yml + environment-name: + description: Name to use for the Conda environment. + default: test + +runs: + using: composite + steps: + - name: Install ${{ inputs.environment-file }} + uses: mamba-org/provision-with-micromamba@v12 + with: + environment-file: ${{ inputs.environment-file }} + environment-name: ${{ inputs.environment-name }} + channels: conda-forge + channel-priority: ${{ runner.os == 'macOS' && 'flexible' || 'strict' }} + condarc-file: ci/condarc.yml + cache-env: true + cache-downloads: true diff --git a/.github/workflows/.gitignore b/.github/workflows/.gitignore new file mode 100644 index 0000000..dcba760 --- /dev/null +++ b/.github/workflows/.gitignore @@ -0,0 +1,9 @@ +# Ignore everything +* + +# Except +!stale-pr.yml +!ubuntu.yml +windows-macos.yml + +!.gitignore diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml new file mode 100644 index 0000000..c5d0eea --- /dev/null +++ b/.github/workflows/stale-pr.yml @@ -0,0 +1,26 @@ +name: "Stale PRs" +on: + schedule: + # * is a special character in YAML so you have to quote this string + - cron: "0 0 * * *" + +permissions: + contents: read + +jobs: + stale: + permissions: + pull-requests: write + runs-on: ubuntu-22.04 + steps: + - uses: actions/stale@v4 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. Please update and respond to this comment if you're still interested in working on this." 
+ stale-pr-label: "Stale" + exempt-pr-labels: "Needs Review,Blocked,Needs Discussion" + days-before-issue-stale: -1 + days-before-pr-stale: 30 + days-before-close: -1 + remove-stale-when-updated: false + debug-only: false diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml new file mode 100644 index 0000000..dd61ac3 --- /dev/null +++ b/.github/workflows/ubuntu.yml @@ -0,0 +1,54 @@ +name: Ubuntu + +on: + push: + branches: ["main"] + pull_request: + branches: ["main"] + paths-ignore: ["docs/**"] + +env: + REGEX_TOOLKIT_CI: 1 + +permissions: + contents: read + +jobs: + pytest: + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -el {0} + timeout-minutes: 60 + strategy: + matrix: + env_file: [actions-310.yml, actions-311.yml] + fail-fast: false + # name: ${{ matrix.name || matrix.env_file }} + name: ${{ matrix.env_file }} + env: + ENV_FILE: ci/deps/${{ matrix.env_file }} + IS_PYPY: ${{ contains(matrix.env_file, 'pypy') }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }} + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ${{ env.ENV_FILE }} + + - name: Build Regex-Toolkit + uses: ./.github/actions/build_regex_toolkit + + - name: Test + uses: ./.github/actions/run-tests + # with: + # codecov-token: ${{ secrets.CODECOV_TOKEN }} diff --git a/.gitignore b/.gitignore index f50a89e..5013d26 100644 --- a/.gitignore +++ b/.gitignore @@ -5,12 +5,18 @@ !setup.py !pyproject.toml !Makefile +!environment.yml +!codecov.yml +!requirements-doc.txt !src/ !src/* - !tests/ !tests/* +!ci/ +!ci/* +!docs/ +!docs/* !LICENSE !README.md diff --git a/README.md b/README.md index e546943..ab34358 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,12 @@ # Regex-Toolkit 
-[Regex-Toolkit](https://github.com/Phosmic/regex-toolkit): Effortlessly craft efficient [RE](https://docs.python.org/3/library/re.html) and [RE2](https://github.com/google/re2) expressions with user-friendly tools. +[Regex-Toolkit](https://github.com/Phosmic/regex-toolkit) Effortlessly craft efficient [RE](https://docs.python.org/3/library/re.html) and [RE2](https://github.com/google/re2) expressions with user-friendly tools. + +--- ## Requirements: -**Regex-Toolkit** requires Python 3.9 or higher, is platform independent, and has no outside dependencies. +**Regex-Toolkit** requires Python 3.10 or higher, is platform independent, and has no outside dependencies. ## Issue reporting @@ -20,22 +22,39 @@ You should have received a copy of the GNU General Public License along with thi --- +[Requirements](#requirements) +[Installing](#installing) +[Usage](#usage) +[Library](#library) + ## Installing Most stable version from [**PyPi**](https://pypi.org/project/regex-toolkit/): +[![PyPI](https://img.shields.io/pypi/v/regex-toolkit?style=flat-square)](https://pypi.org/project/regex-toolkit/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/regex-toolkit?style=flat-square)](https://pypi.org/project/regex-toolkit/) +[![PyPI - License](https://img.shields.io/pypi/l/regex-toolkit?style=flat-square)](https://pypi.org/project/regex-toolkit/) + ```bash python3 -m pip install regex-toolkit ``` Development version from [**GitHub**](https://github.com/Phosmic/regex-toolkit): + +![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/Phosmic/regex-toolkit/ubuntu.yml?style=flat-square) +![Codecov](https://img.shields.io/codecov/c/github/Phosmic/regex-toolkit/master?flag=unittests&style=flat-square&token=XMJZIW8ZL3) +![GitHub](https://img.shields.io/github/license/Phosmic/regex-toolkit?style=flat-square) + + ```bash git clone git+https://github.com/Phosmic/regex-toolkit.git cd regex-toolkit -python3 -m pip install . +python3 -m pip install -e . 
``` +--- + ## Usage Import packages: @@ -47,455 +66,552 @@ import re2 ``` ```python -# Can import directly if desired -import regex_toolkit as rtk +import regex_toolkit ``` --- ## Library -### iter_sort_by_len - -Function to iterate strings sorted by length. + -| Function Signature | -| :------------------------------------------------ | -| iter_sort_by_len(package_name, \*, reverse=False) | +# `regex_toolkit.base` -| Parameters | | -| :------------------------- | :---------------------------------------------- | -| **texts**_(Iterable[str])_ | Strings to sort. | -| **reverse**_(int)_ | Sort in descending order (longest to shortest). | + -Example (ascending shortest to longest): +#### `iter_sort_by_len` ```python -words = ["longest", "short", "longer"] -for word in rtk.iter_sort_by_len(words): - print(word) +def iter_sort_by_len(texts: Iterable[str], + *, + reverse: bool = False) -> Iterable[str] ``` -Output: +Iterate strings sorted by length. -```text -short -longer -longest -``` +**Arguments**: + +- `texts` _Iterable[str]_ - Strings to sort. +- `reverse` _bool, optional_ - Sort in descending order (longest to shortest). Defaults to False. + +**Yields**: + +- _str_ - Strings sorted by length. + + -Example reversed (descending longest to shortest): +#### `sort_by_len` ```python -words = ["longest", "short", "longer"] -for word in rtk.iter_sort_by_len(words, reverse=True): - print(word) +def sort_by_len(texts: Iterable[str], + *, + reverse: bool = False) -> tuple[str, ...] ``` -Output: +Sort strings by length. -```text -longest -longer -short -``` +**Arguments**: -### sort_by_len +- `texts` _Iterable[str]_ - Strings to sort. +- `reverse` _bool, optional_ - Sort in descending order (longest to shortest). Defaults to False. -Function to get a tuple of strings sorted by length. +**Returns**: -| Function Signature | -| :------------------------------------------- | -| sort_by_len(package_name, \*, reverse=False) | +- _tuple[str]_ - Strings sorted by length. 
-| Parameters | | -| :------------------------- | :---------------------------------------------- | -| **texts**_(Iterable[str])_ | Strings to sort. | -| **reverse**_(int)_ | Sort in descending order (longest to shortest). | + -Example (ascending shortest to longest): +#### `ord_to_codepoint` ```python -rtk.sort_by_len(["longest", "short", "longer"]) +def ord_to_codepoint(ordinal: int) -> str ``` -Result: +Character codepoint from character ordinal. -```python -('short', 'longer', 'longest') -``` +**Arguments**: -Example reversed (descending longest to shortest): +- `ordinal` _int_ - Character ordinal. -```python -rtk.sort_by_len(["longest", "short", "longer"], reverse=True) -``` +**Returns**: + +- _str_ - Character codepoint. + + -Result: +#### `codepoint_to_ord` ```python -('longest', 'longer', 'short') +def codepoint_to_ord(codepoint: str) -> int ``` -### ord_to_codepoint +Character ordinal from character codepoint. -Function to get a character codepoint from a character ordinal. +**Arguments**: -| Function Signature | -| :------------------------ | -| ord_to_codepoint(ordinal) | +- `codepoint` _str_ - Character codepoint. -| Parameters | | -| :----------------- | :----------------- | -| **ordinal**_(int)_ | Character ordinal. | +**Returns**: -Example: +- _int_ - Character ordinal. + + + +#### `char_to_codepoint` ```python -# ordinal: 127344 -ordinal = ord("πŸ…°") -rtk.ord_to_codepoint(ordinal) +def char_to_codepoint(char: str) -> str ``` -Result: +Character codepoint from character. + +**Arguments**: + +- `char` _str_ - Character. + +**Returns**: + +- _str_ - Character codepoint. + + + +#### `char_as_exp` ```python -'0001f170' +def char_as_exp(char: str) -> str ``` -### codepoint_to_ord +Create a RE regex expression that exactly matches a character. + +Escape to avoid reserved character classes (i.e. \\s, \\S, \\d, \\D, \\1, etc.). + +**Arguments**: + +- `char` _str_ - Character to match. -Function to get a character ordinal from a character codepoint. 
+**Returns**: -| Function Signature | -| :-------------------------- | -| codepoint_to_ord(codepoint) | +- _str_ - RE expression that exactly matches the original character. -| Parameters | | -| :------------------- | :------------------- | -| **codepoint**_(str)_ | Character codepoint. | + -Example: +#### `char_as_exp2` ```python -# char: "πŸ…°" -codepoint = "0001f170" -rtk.codepoint_to_ord(codepoint) +def char_as_exp2(char: str) -> str ``` -Result: +Create a RE2 regex expression that exactly matches a character. + +**Arguments**: + +- `char` _str_ - Character to match. + +**Returns**: + +- _str_ - RE2 expression that exactly matches the original character. + + + +#### `string_as_exp` ```python -127344 +def string_as_exp(text: str) -> str ``` -### char_to_codepoint +Create a RE regex expression that exactly matches a string. + +**Arguments**: + +- `text` _str_ - String to match. -Function to get a character codepoint from a character. +**Returns**: -| Function Signature | -| :---------------------- | -| char_to_codepoint(char) | +- _str_ - RE expression that exactly matches the original string. -| Parameters | | -| :-------------- | :--------- | -| **char**_(str)_ | Character. | + -Example: +#### `string_as_exp2` ```python -rtk.char_to_codepoint("πŸ…°") +def string_as_exp2(text: str) -> str ``` -Result: +Create a RE2 regex expression that exactly matches a string. + +**Arguments**: + +- `text` _str_ - String to match. + +**Returns**: + +- _str_ - RE2 expression that exactly matches the original string. + + + +#### `strings_as_exp` ```python -'0001f170' +def strings_as_exp(texts: Iterable[str]) -> str ``` -### char_as_exp +Create a RE regex expression that exactly matches any one string. + +**Arguments**: -Function to create a **RE** expression that exactly matches a character. +- `texts` _Iterable[str]_ - Strings to match. 
-| Function Signature | -| :----------------- | -| char_as_exp(char) | +**Returns**: -| Parameters | | -| :-------------- | :------------------ | -| **char**_(str)_ | Character to match. | +- _str_ - RE expression that exactly matches any one of the original strings. -Example: + + +#### `strings_as_exp2` ```python -rtk.char_as_exp("πŸ…°") +def strings_as_exp2(texts: Iterable[str]) -> str ``` -Result: +Create a RE2 regex expression that exactly matches any one string. + +**Arguments**: + +- `texts` _Iterable[str]_ - Strings to match. + +**Returns**: + +- _str_ - RE2 expression that exactly matches any one of the original strings. + + + +#### `iter_char_range` ```python -r'\πŸ…°' +def iter_char_range(first_codepoint: int, + last_codepoint: int) -> Iterable[str] ``` -### char_as_exp2 +Iterate all character within a range of codepoints (inclusive). -Function to create a **RE** expression that exactly matches a character. +**Arguments**: -| Function Signature | -| :----------------- | -| char_as_exp2(char) | +- `first_codepoint` _int_ - Starting (first) codepoint. +- `last_codepoint` _int_ - Ending (last) codepoint. -| Parameters | | -| :-------------- | :------------------ | -| **char**_(str)_ | Character to match. | +**Yields**: -Example: +- _str_ - Character from within a range of codepoints. + + + +#### `char_range` ```python -rtk.char_as_exp2("πŸ…°") +def char_range(first_codepoint: int, last_codepoint: int) -> tuple[str, ...] ``` -Result: +Tuple of all character within a range of codepoints (inclusive). + +**Arguments**: + +- `first_codepoint` _int_ - Starting (first) codepoint. +- `last_codepoint` _int_ - Ending (last) codepoint. + +**Returns**: + + tuple[str, ...]: Characters within a range of codepoints. + + + +#### `mask_span` ```python -r'\x{0001f170}' +def mask_span(text: str, + span: list[int] | tuple[int, int], + mask: str | None = None) -> str ``` -### string_as_exp +Slice and mask a string using a single span. 
+ +**Arguments**: -Function to create a **RE** expression that exactly matches a string. +- `text` _str_ - String to slice. +- `span` _list[int] | tuple[int, int]_ - Domain of index positions (start, end) to mask. +- `mask` _str, optional_ - Mask to insert after slicing. Defaults to None. -| Function Signature | -| :------------------ | -| string_as_exp(text) | +**Returns**: -| Parameters | | -| :-------------- | :--------------- | -| **text**_(str)_ | String to match. | +- _str_ - String with span replaced with the mask text. -Example: + + +#### `mask_spans` ```python -rtk.string_as_exp("πŸ…°πŸ…±πŸ…²") +def mask_spans(text: str, + spans: Iterable[list[int] | tuple[int, int]], + masks: Iterable[str] | None = None) -> str ``` -Result: +Slice and mask a string using multiple spans. + +**Arguments**: + +- `text` _str_ - String to slice. +- `spans` _Iterable[list[int] | tuple[int, int]]_ - Domains of index positions (x1, x2) to mask from the text. +- `masks` _Iterable[str], optional_ - Masks to insert when slicing. Defaults to None. + +**Returns**: + +- _str_ - String with all spans replaced with the mask text. + + + +#### `to_nfc` ```python -r'\πŸ…°\πŸ…±\πŸ…²' +def to_nfc(text: str) -> str ``` -### string_as_exp2 +Normalize a Unicode string to NFC form C. + +Form C favors the use of a fully combined character. -Function to create a **RE** expression that exactly matches a string. +**Arguments**: -| Function Signature | -| :------------------- | -| string_as_exp2(text) | +- `text` _str_ - String to normalize. -| Parameters | | -| :-------------- | :--------------- | -| **text**_(str)_ | String to match. | +**Returns**: -Example: +- _str_ - Normalized string. 
+ + + +# `regex_toolkit.base_BAK_2022-11-18` + + + +## `RegexToolkit` Objects ```python -rtk.string_as_exp2("πŸ…°πŸ…±πŸ…²") +class RegexToolkit() ``` -Result: + + +#### `RegexToolkit.char_as_exp` ```python -r'\x{0001f170}\x{0001f171}\x{0001f172}' +@staticmethod +def char_as_exp(char: str) -> str ``` -### strings_as_exp +Create a re Regex Expression that Exactly Matches a Character + +Expressions like \s, \S, \d, \D, \1, etc. are reserved. + +**Arguments**: -Function to create a **RE** expression that exactly matches any one string. +- `char` _str_ - Character to match. -| Function Signature | -| :-------------------- | -| strings_as_exp(texts) | +**Returns**: -| Parameters | | -| :------------------------- | :---------------- | -| **texts**_(Iterable[str])_ | Strings to match. | +- _str_ - re expression that exactly matches the original character. -Example: + + +#### `RegexToolkit.char_as_exp2` ```python -rtk.strings_as_exp([ - "bad.word", - "another-bad-word", -]) +@staticmethod +def char_as_exp2(char: str) -> str ``` -Result: +Create a re2 Regex Expression that Exactly Matches a Character + +**Arguments**: + +- `char` _str_ - Character to match. + +**Returns**: + +- _str_ - re2 expression that exactly matches the original character. + + + +#### `RegexToolkit.string_as_exp` ```python -r'another\-bad\-word|bad\.word' +@staticmethod +def string_as_exp(text: str) -> str ``` -### strings_as_exp2 +Create a re Regex Expression that Exactly Matches a String -Function to create a **RE** expression that exactly matches any one string. +**Arguments**: -| Function Signature | -| :--------------------- | -| strings_as_exp2(texts) | +- `text` _str_ - String to match. -| Parameters | | -| :------------------------- | :---------------- | -| **texts**_(Iterable[str])_ | Strings to match. | +**Returns**: -Example: +- _str_ - re expression that exactly matches the original string. 
+ + + +#### `RegexToolkit.string_as_exp2` ```python -rtk.strings_as_exp2([ - "bad.word", - "another-bad-word", -]) +@staticmethod +def string_as_exp2(text: str) -> str ``` -Result: +Create a re2 Regex Expression that Exactly Matches a String + +**Arguments**: + +- `text` _str_ - String to match. + +**Returns**: + +- _str_ - re2 expression that exactly matches the original string. + + + +#### `RegexToolkit.strings_as_exp` ```python -r'another\-bad\-word|bad\.word' +@staticmethod +def strings_as_exp(texts: Iterable[str]) -> str ``` -### iter_char_range +re + + -Function to iterate all characters within a range of codepoints (inclusive). +#### `RegexToolkit.strings_as_exp2` -| Function | -| :------------------------------------------------- | -| iter_char_range(first_codepoint, second_codepoint) | +```python +@staticmethod +def strings_as_exp2(texts: Iterable[str]) -> str +``` -| Parameters | | -| :------------------------- | :-------------------------- | -| **first_codepoint**_(int)_ | Starting (first) codepoint. | -| **last_codepoint**_(int)_ | Ending (last) codepoint. | +re2 -Example: + + +#### `RegexToolkit.iter_char_range` ```python -for char in rtk.iter_char_range("a", "c"): - print(char) +@staticmethod +def iter_char_range(first_codepoint: int, + last_codepoint: int) -> Iterable[str] ``` -Output: +Iterate All Characters within a Range of Codepoints (Inclusive) -```text -a -b -c -``` +**Arguments**: -### char_range +- `first_codepoint` _int_ - Starting codepoint. +- `last_codepoint` _int_ - Final codepoint. -Function to get a tuple of all characters within a range of codepoints (inclusive). +**Yields**: -| Function | -| :-------------------------------------------- | -| char_range(first_codepoint, second_codepoint) | +- _str_ - Character from within a range of codepoints. -| Parameters | | -| :------------------------- | :-------------------------- | -| **first_codepoint**_(int)_ | Starting (first) codepoint. 
| -| **last_codepoint**_(int)_ | Ending (last) codepoint. | + -Example: +#### `RegexToolkit.char_range` ```python -rtk.char_range("a", "c") +@staticmethod +def char_range(first_codepoint: int, last_codepoint: int) -> tuple[str, ...] ``` -Result: +Tuple of All Characters within a Range of Codepoints (Inclusive) + +**Arguments**: + +- `first_codepoint` _int_ - Starting codepoint. +- `last_codepoint` _int_ - Final codepoint. + +**Returns**: + + tuple[str, ...]: Characters within a range of codepoints. + + + +#### `RegexToolkit.is_digit` ```python -('a', 'b', 'c') +@staticmethod +def is_digit(char: str) -> bool ``` -### mask_span +Check if a Character is a Digit [0-9] + +**Arguments**: + +- `char` _str_ - Character to check. -Slice and mask a string using a span. +**Returns**: -| Function Signature | -| :------------------------------- | -| mask_span(text, span, mask=None) | +- _bool_ - True if the character is a digit. -| Parameters | | -| :--------------------------------------- | :---------------------------------------------- | -| **text**_(str)_ | Text to slice. | -| **span**_(list[int] \| tuple[int, int])_ | Domain of index positions (start, end) to mask. | -| **mask**_(str \| None)_ | Mask to insert after slicing. | + -Example: +#### `RegexToolkit.mask_span` ```python -rtk.mask_span( - "This is an example", - (8, 8), - mask="not ", -) +@staticmethod +def mask_span(text: str, span, mask: str | None = None) -> str ``` -Result: +Slice and Mask Text using a Span + + + +#### `RegexToolkit.mask_spans` ```python -'This is not an example' +@staticmethod +def mask_spans(text: str, spans: Iterable[Sequence[int]], + masks: Iterable[str]) -> str ``` -### mask_spans +Slice and Mask a String using Multiple Spans -Slice and mask a string using multiple spans. +NOTE: Original values for spans and masks parameters will be modified! 
+ +**Arguments**: -| Function Signature | -| :---------------------------------- | -| mask_spans(text, spans, masks=None) | +- `text` _str_ - Text to slice. +- `spans` _Spans_ - Domains of index positions to mask from the text. +- `masks` _Masks, optional_ - Masks to insert when slicing. Defaults to None. -| Parameters | | -| :-------------------------------------------------- | :--------------------------------------------------------- | -| **text**_(str)_ | Text to slice. | -| **spans**_(Iterable[list[int] \| tuple[int, int]])_ | Domains of index positions (x1, x2) to mask from the text. | -| **masks**_(Iterable[str] \| None)_ | Masks to insert when slicing. | +**Returns**: -Example: +- _str_ - Text with all spans replaced with the mask text. + + + +#### `RegexToolkit.to_utf8` ```python -rtk.mask_spans( - "This is an example", - spans=[ - (9, 10), - (11, 18), - ], - masks=[ - " good", - "sample", - ], -) +@staticmethod +def to_utf8(text: str) -> str ``` -### to_utf8 +Force UTF-8 Text Encoding -Encode a unicode string to UTF-8 form. +**Arguments**: -| Function Signature | -| :----------------- | -| to_utf8(text) | +- `text` _str_ - Text to encode. -| Parameters | | -| :-------------- | :-------------- | -| **text**_(str)_ | Text to encode. | +**Returns**: -### to_nfc +- _str_ - Encoded text. -[Normalize](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize) a Unicode string to NFC form C. -| Function Signature | -| :----------------- | -| to_utf8(text) | +--- -| Parameters | | -| :-------------- | :----------------- | -| **text**_(str)_ | Text to normalize. 
| diff --git a/ci/.gitignore b/ci/.gitignore new file mode 100644 index 0000000..10b884c --- /dev/null +++ b/ci/.gitignore @@ -0,0 +1,12 @@ +# Ignore everything +* + +# Except +!run_tests.sh +!pre_commit.sh +!condarc.yml + +!deps/ +!deps/* + +!.gitignore diff --git a/ci/condarc.yml b/ci/condarc.yml new file mode 100644 index 0000000..9d750b7 --- /dev/null +++ b/ci/condarc.yml @@ -0,0 +1,32 @@ +# https://docs.conda.io/projects/conda/en/latest/configuration.html + +# always_yes (NoneType, bool) +# aliases: yes +# Automatically choose the 'yes' option whenever asked to proceed with a +# conda operation, such as when running `conda install`. +# +always_yes: true + +# remote_connect_timeout_secs (float) +# The number seconds conda will wait for your client to establish a +# connection to a remote url resource. +# +remote_connect_timeout_secs: 30.0 + +# remote_max_retries (int) +# The maximum number of retries each HTTP connection should attempt. +# +remote_max_retries: 10 + +# remote_backoff_factor (int) +# The factor determines the time HTTP connection should wait for +# attempt. +# +remote_backoff_factor: 3 + +# remote_read_timeout_secs (float) +# Once conda has connected to a remote resource and sent an HTTP +# request, the read timeout is the number of seconds conda will wait for +# the server to send a response. 
+# +remote_read_timeout_secs: 60.0 diff --git a/ci/deps/.gitignore b/ci/deps/.gitignore new file mode 100644 index 0000000..ae282f7 --- /dev/null +++ b/ci/deps/.gitignore @@ -0,0 +1,8 @@ +# Ignore everything +* + +# Except +!actions-310.yml +!actions-311.yml + +!.gitignore diff --git a/ci/deps/actions-310.yml b/ci/deps/actions-310.yml new file mode 100644 index 0000000..fbfacf9 --- /dev/null +++ b/ci/deps/actions-310.yml @@ -0,0 +1,11 @@ +name: regex_toolkit +channels: + - conda-forge +dependencies: + - python=3.10 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + # - pytest-asyncio>=0.17 \ No newline at end of file diff --git a/ci/deps/actions-311.yml b/ci/deps/actions-311.yml new file mode 100644 index 0000000..3dba7a8 --- /dev/null +++ b/ci/deps/actions-311.yml @@ -0,0 +1,11 @@ +name: regex_toolkit +channels: + - conda-forge +dependencies: + - python=3.11 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + # - pytest-asyncio>=0.17 diff --git a/ci/run_tests.sh b/ci/run_tests.sh new file mode 100755 index 0000000..bb97064 --- /dev/null +++ b/ci/run_tests.sh @@ -0,0 +1,22 @@ +#!/bin/bash -e + +# Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set) +# https://github.com/pytest-dev/pytest/issues/920 +# https://github.com/pytest-dev/pytest/issues/1075 +export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') + +# May help reproduce flaky CI builds if set in subsequent runs +echo PYTHONHASHSEED=$PYTHONHASHSEED + +# If no X server is found, we use xvfb to emulate it +if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then + export DISPLAY=":0" + XVFB="xvfb-run " +fi + +# TODO: Consider adding as an input parameter +PYTEST_TARGET=tests +PYTEST_CMD="${XVFB}pytest -r fEs -s --cov=src --cov-report=xml --cov-append $PYTEST_TARGET" + +echo $PYTEST_CMD +sh -c "$PYTEST_CMD" diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 
index 0000000..95f63f7 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,11 @@ +# Ignore everything +* + +# Except +!render_readme.py +!config.json + +!templates/ +!templates/* + +!.gitignore diff --git a/docs/config.json b/docs/config.json new file mode 100644 index 0000000..7415a19 --- /dev/null +++ b/docs/config.json @@ -0,0 +1,22 @@ +{ + "packages_dir": "../src", + "packages": ["regex_toolkit"], + "output_file": "README.md", + "main_template": "main.md.jinja", + "templates_dir": "./templates", + "rendered_filename": "rendered_libs.md", + "template_data": { + "pypi": { + "name": "regex-toolkit", + "full_name": "Regex-Toolkit" + }, + "repo": { + "name": "regex-toolkit", + "full_name": "Regex-Toolkit", + "owner": "Phosmic", + "badge_style": "flat-square", + "primary_branch": "master", + "actions_template": "ubuntu.yml" + } + } +} diff --git a/docs/render_readme.py b/docs/render_readme.py new file mode 100644 index 0000000..fbf9aff --- /dev/null +++ b/docs/render_readme.py @@ -0,0 +1,132 @@ +# import argparse +import json +import logging +import os +import re + +from jinja2 import Environment, FileSystemLoader +from pydoc_markdown import PydocMarkdown +from pydoc_markdown.contrib.loaders.python import PythonLoader +from pydoc_markdown.contrib.processors.crossref import CrossrefProcessor +from pydoc_markdown.contrib.processors.filter import FilterProcessor +from pydoc_markdown.contrib.processors.smart import GoogleProcessor +from pydoc_markdown.contrib.renderers.markdown import MarkdownRenderer +from pydoc_markdown.interfaces import Context + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def load_config(config_file: str) -> dict: + # TODO: Resolve paths here + with open(config_file, mode="r", encoding="utf-8") as file: + return json.loads(file.read()) + + +def render_library_contents( + packages_dir: str, + packages: list[str], + templates_dir: str, + rendered_filename: str, +) -> None: + """Render the Documentation for Python Modules to a 
File + + Args: + packages_dir (str): Base directory to search for modules. + packages (list[str]): Packages to search for modules. + templates_dir (str): Directory containing the template files. + rendered_filename (str): File to render the library contents to. + """ + output_path = os.path.join(templates_dir, rendered_filename) + session = PydocMarkdown( + loaders=[ + PythonLoader(packages=packages, encoding="utf-8"), + ], + processors=[ + FilterProcessor( + expression="not name.startswith('_') and default()", + documented_only=True, + exclude_private=True, + exclude_special=True, + do_not_filter_modules=True, + skip_empty_modules=True, + ), + GoogleProcessor(), + CrossrefProcessor(), + ], + renderer=MarkdownRenderer( + filename=output_path, + encoding="utf-8", + code_headers=True, + add_method_class_prefix=True, + add_member_class_prefix=True, + signature_code_block=True, + render_typehint_in_data_header=True, + toc_maxdepth=3, + ), + ) + context = Context(packages_dir) + session.init(context) + session.ensure_initialized() + modules = session.load_modules() + session.process(modules) + # session.run_hooks("post-render") + session.render(modules, run_hooks=True) + + # TODO: Fix these "hacks" + + # Read the original + with open(output_path, mode="r", encoding="utf-8") as file: + rendered_contents = file.read() + + # NOTE: Any types containing a "_" will be excluded from this fix + # Fix some missing highlighting in the "**Returns**" and "**Yields**" sections + rendered_contents = re.sub( + r"\*\*(Returns|Yields)\*\*:\n\n ([a-zA-Z0-9, \|\[\]]+): ", + r"**\1**:\n\n- `\2` - ", + rendered_contents, + ) + # Change the Returns and Yields code blocks to italics + rendered_contents = re.sub( + r"\*\*(Returns|Yields)\*\*:\n\n- `([a-zA-Z0-9, \|\[\]]+)` - ", + r"**\1**:\n\n- _\2_ - ", + rendered_contents, + ) + + # Fix trailing newlines with two spaces + rendered_contents = re.sub(r"\n \n", "\n\n", rendered_contents) + + # Condense consecutive newlines to two + 
rendered_contents = re.sub(r"\n{2,}", "\n\n", rendered_contents) + + # Write the corrected contents + with open(output_path, mode="w", encoding="utf-8") as file: + file.write(rendered_contents) + + +def main() -> None: # config_file: str, template_file: str, output_file: str, replace: bool) -> None: + # Load the config + with open("config.json", mode="r", encoding="utf-8") as file: + config = json.loads(file.read()) + + # Generate the library documentation + render_library_contents( + packages_dir=config["packages_dir"], + packages=config["packages"], + templates_dir=config["templates_dir"], + rendered_filename=config["rendered_filename"], + ) + + # Render the markdown readme + # TODO: Move this to a separate function? + loader = FileSystemLoader(config["templates_dir"]) + environment = Environment(loader=loader, auto_reload=False) + template = environment.get_template(config["main_template"]) + rendered = template.render(**config["template_data"]) + with open(config["output_file"], mode="w", encoding="utf-8") as file: + file.write(rendered) + + +if __name__ == "__main__": + # TODO: Implement argparse here + main() diff --git a/docs/templates/.gitignore b/docs/templates/.gitignore new file mode 100644 index 0000000..2e53636 --- /dev/null +++ b/docs/templates/.gitignore @@ -0,0 +1,14 @@ +# Ignore everything +* + +# Except +!body.md.jinja +!header.md.jinja +!footer.md.jinja +!install.md.jinja +!library.md.jinja +!requirements.md.jinja +!usage.md.jinja +!main.md.jinja + +!.gitignore diff --git a/docs/templates/body.md.jinja b/docs/templates/body.md.jinja new file mode 100644 index 0000000..10b23bc --- /dev/null +++ b/docs/templates/body.md.jinja @@ -0,0 +1,26 @@ +## Requirements: + +{% include 'requirements.md.jinja' %} + +--- + +[Requirements](#requirements) +[Installing](#installing) +[Usage](#usage) +[Library](#library) + +## Installing + +{% with pypi=pypi, repo=repo %}{% include 'install.md.jinja' %}{% endwith %} + +--- + +## Usage + +{% include 'usage.md.jinja' 
%} + +--- + +## Library + +{% include 'library.md.jinja' %} \ No newline at end of file diff --git a/docs/templates/footer.md.jinja b/docs/templates/footer.md.jinja new file mode 100644 index 0000000..e69de29 diff --git a/docs/templates/header.md.jinja b/docs/templates/header.md.jinja new file mode 100644 index 0000000..40fbde6 --- /dev/null +++ b/docs/templates/header.md.jinja @@ -0,0 +1 @@ +[{{ repo.full_name }}](https://github.com/{{ repo.owner }}/{{ repo.name }}) Effortlessly craft efficient [RE](https://docs.python.org/3/library/re.html) and [RE2](https://github.com/google/re2) expressions with user-friendly tools. \ No newline at end of file diff --git a/docs/templates/install.md.jinja b/docs/templates/install.md.jinja new file mode 100644 index 0000000..83d26c3 --- /dev/null +++ b/docs/templates/install.md.jinja @@ -0,0 +1,27 @@ +Most stable version from [**PyPi**](https://pypi.org/project/{{ pypi.name }}/): + +[![PyPI](https://img.shields.io/pypi/v/{{ pypi.name }}?style=flat-square)](https://pypi.org/project/{{ pypi.name }}/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/{{ pypi.name }}?style=flat-square)](https://pypi.org/project/{{ pypi.name }}/) +[![PyPI - License](https://img.shields.io/pypi/l/{{ pypi.name }}?style=flat-square)](https://pypi.org/project/{{ pypi.name }}/) + +```bash +python3 -m pip install {{ pypi.name }} +``` + +Development version from [**GitHub**](https://github.com/{{ repo.owner }}/{{ repo.name }}): + +{% if repo.badge_style %} +![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/{{ repo.owner }}/{{ repo.name }}/{{ repo.actions_template }}?style={{ repo.badge_style }}) +![Codecov](https://img.shields.io/codecov/c/github/{{ repo.owner }}/{{ repo.name }}/{{ repo.primary_branch }}?flag=unittests&style={{ repo.badge_style }}&token=XMJZIW8ZL3) +![GitHub](https://img.shields.io/github/license/{{ repo.owner }}/{{ repo.name }}?style={{ repo.badge_style }}) +{% else %} +[![GitHub Workflow 
Status](https://img.shields.io/github/actions/workflow/status/{{ repo.owner }}/{{ repo.name }}/{{ repo.actions_template }})](https://github.com/{{ repo.owner }}/{{ repo.name }}) +[![Codecov](https://img.shields.io/codecov/c/github/{{ repo.owner }}/{{ repo.name }}/{{ repo.primary_branch }}?flag=unittests&token=XMJZIW8ZL3)](https://github.com/{{ repo.owner }}/{{ repo.name }}) +[![GitHub](https://img.shields.io/github/license/{{ repo.owner }}/{{ repo.name }})](https://github.com/{{ repo.owner }}/{{ repo.name }}) +{% endif %} + +```bash +git clone https://github.com/{{ repo.owner }}/{{ repo.name }}.git +cd {{ repo.name }} +python3 -m pip install -e . +``` \ No newline at end of file diff --git a/docs/templates/library.md.jinja b/docs/templates/library.md.jinja new file mode 100644 index 0000000..c0f8264 --- /dev/null +++ b/docs/templates/library.md.jinja @@ -0,0 +1 @@ +{% include 'rendered_libs.md' %} \ No newline at end of file diff --git a/docs/templates/main.md.jinja b/docs/templates/main.md.jinja new file mode 100644 index 0000000..3d9df33 --- /dev/null +++ b/docs/templates/main.md.jinja @@ -0,0 +1,11 @@ +# {{ repo.full_name }} + +{% include 'header.md.jinja' %} + +--- + +{% include 'body.md.jinja' %} + +--- + +{% include 'footer.md.jinja' %} \ No newline at end of file diff --git a/docs/templates/requirements.md.jinja b/docs/templates/requirements.md.jinja new file mode 100644 index 0000000..48546b8 --- /dev/null +++ b/docs/templates/requirements.md.jinja @@ -0,0 +1,13 @@ +**{{ repo.full_name }}** requires Python 3.10 or higher, is platform independent, and has no outside dependencies. + +## Issue reporting + +If you discover an issue with {{ repo.full_name }}, please report it at [https://github.com/{{ repo.owner }}/{{ repo.name }}/issues](https://github.com/{{ repo.owner }}/{{ repo.name }}/issues).
+ +## License + +This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/. \ No newline at end of file diff --git a/docs/templates/usage.md.jinja b/docs/templates/usage.md.jinja new file mode 100644 index 0000000..393c0d8 --- /dev/null +++ b/docs/templates/usage.md.jinja @@ -0,0 +1,11 @@ +Import packages: + +```python +import re +# and/or +import re2 +``` + +```python +import regex_toolkit +``` \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..a846c89 --- /dev/null +++ b/environment.yml @@ -0,0 +1,20 @@ +# Local development dependencies including docs building, website upload, ASV benchmark +name: regex_toolkit +channels: + - conda-forge +dependencies: + - python=3.10 + - pip + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + # - pytest-asyncio>=0.17 + - coverage + + # Code checks + - black=22.10.0 + - flake8=6.0.0 + - isort>=5.2.1 + - mypy=1.0 diff --git a/pyproject.toml b/pyproject.toml index 88635b2..068f981 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ "Topic :: Software Development :: Libraries", "Topic :: Software Development :: Libraries :: Python Modules", ] -dependencies = ["google-re2>=0.2.20210901", "lxml>=4.6.3"] +dependencies = [] dynamic = ["version"] [project.urls] diff --git a/requirements-doc.txt b/requirements-doc.txt new file mode 100644 index 0000000..8a39850 --- /dev/null 
+++ b/requirements-doc.txt @@ -0,0 +1,2 @@ +novella==0.2.3 +pydoc-markdown==4.6.4 \ No newline at end of file diff --git a/setup.py b/setup.py index 2c04005..42c2e30 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import os -REQUIRED_PYTHON = (3, 9) +REQUIRED_PYTHON = (3, 10) CURRENT_PYTHON = sys.version_info[:2] if CURRENT_PYTHON < REQUIRED_PYTHON: @@ -13,7 +13,7 @@ ========================== Unsupported Python version ========================== -This version of Regex Toolkit requires at least Python {}.{}, but you're trying to install it on Python {}.{}. +This version of Regex-Toolkit requires at least Python {}.{}, but you're trying to install it on Python {}.{}. """.format( *(REQUIRED_PYTHON + CURRENT_PYTHON) ) @@ -22,38 +22,7 @@ here = os.path.abspath(os.path.dirname(__file__)) -if sys.argv[-1] == "build": - # Build - status = os.system("python3 -m build") - sys.exit(status) -elif sys.argv[-1] == "publish": - # Build and publish - status = os.system("python3 -m build") - if status == 0: - status = os.system( - " ".join( - [ - "twine upload", - os.path.join(here, "dist", "regex_toolkit-*.tar.gz"), - os.path.join(here, "dist", "regex_toolkit-*.whl"), - ] - ) - ) - sys.exit(status) -elif sys.argv[-1] == "test": - # Test - import unittest - - # Default shared TestLoader instance - test_loader = unittest.defaultTestLoader - # Basic test runner that outputs to sys.stderr - test_runner = unittest.TextTestRunner() - # Discover all tests - test_suite = test_loader.discover(os.path.join(here, "tests")) - # Run the test suite - test_runner.run(test_suite) -else: - # Legacy install - from setuptools import setup +# Legacy install +from setuptools import setup - setup() +setup() diff --git a/src/regex_toolkit/.gitignore b/src/regex_toolkit/.gitignore index 5981298..5f3e828 100644 --- a/src/regex_toolkit/.gitignore +++ b/src/regex_toolkit/.gitignore @@ -4,5 +4,6 @@ # Except !__init__.py !base.py +!constants.py !.gitignore diff --git a/src/regex_toolkit/__init__.py 
b/src/regex_toolkit/__init__.py index 36366a9..ff07dce 100644 --- a/src/regex_toolkit/__init__.py +++ b/src/regex_toolkit/__init__.py @@ -1,3 +1,40 @@ -from .base import * +from .base import ( + char_as_exp, + char_as_exp2, + char_range, + char_to_codepoint, + codepoint_to_ord, + iter_char_range, + iter_sort_by_len, + mask_span, + mask_spans, + ord_to_codepoint, + sort_by_len, + string_as_exp, + string_as_exp2, + strings_as_exp, + strings_as_exp2, + to_nfc, + to_utf8, +) +__all__ = [ + "char_as_exp", + "char_as_exp2", + "char_range", + "char_to_codepoint", + "codepoint_to_ord", + "iter_char_range", + "iter_sort_by_len", + "mask_span", + "mask_spans", + "ord_to_codepoint", + "sort_by_len", + "string_as_exp", + "string_as_exp2", + "strings_as_exp", + "strings_as_exp2", + "to_nfc", + "to_utf8", +] __version__ = "0.0.3" diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index 845e4bf..75599a0 100644 --- a/src/regex_toolkit/base.py +++ b/src/regex_toolkit/base.py @@ -1,30 +1,26 @@ __all__ = [ - "iter_sort_by_len", - "sort_by_len", - "ord_to_codepoint", - "codepoint_to_ord", - "char_to_codepoint", "char_as_exp", "char_as_exp2", + "char_range", + "char_to_codepoint", + "codepoint_to_ord", + "iter_char_range", + "iter_sort_by_len", + "mask_span", + "mask_spans", + "ord_to_codepoint", + "sort_by_len", "string_as_exp", "string_as_exp2", "strings_as_exp", "strings_as_exp2", - "iter_char_range", - "mask_span", - "mask_spans", - "to_utf8", "to_nfc", + "to_utf8", ] -import string import unicodedata - from collections.abc import Iterable -_ALPHA_CHARS: set[str] = set(string.ascii_letters) -_DIGIT_CHARTS: set[str] = set(string.digits) -_SAFE_CHARS: set[str] = _ALPHA_CHARS.union(_DIGIT_CHARTS).union(set(string.whitespace)) -_RE2_ESCAPABLE_CHARS: set[str] = set(string.punctuation) +from regex_toolkit.constants import RE2_ESCAPABLE_CHARS, SAFE_CHARS def iter_sort_by_len( @@ -32,7 +28,7 @@ def iter_sort_by_len( *, reverse: bool = False, ) -> Iterable[str]: - """Iterate 
Texts Sorted by Length + """Iterate strings sorted by length. Args: texts (Iterable[str]): Strings to sort. @@ -50,7 +46,7 @@ def sort_by_len( *, reverse: bool = False, ) -> tuple[str, ...]: - """Strings Sorted by Length + """Sort strings by length. Args: texts (Iterable[str]): Strings to sort. @@ -63,7 +59,7 @@ def sort_by_len( def ord_to_codepoint(ordinal: int) -> str: - """Character Codepoint from Character Ordinal + """Character codepoint from character ordinal. Args: ordinal (int): Character ordinal. @@ -75,7 +71,7 @@ def ord_to_codepoint(ordinal: int) -> str: def codepoint_to_ord(codepoint: str) -> int: - """Character Ordinal from Character Codepoint + """Character ordinal from character codepoint. Args: codepoint (str): Character codepoint. @@ -87,7 +83,7 @@ def codepoint_to_ord(codepoint: str) -> int: def char_to_codepoint(char: str) -> str: - """Character Codepoint from Character + """Character codepoint from character. Args: char (str): Character. @@ -99,9 +95,9 @@ def char_to_codepoint(char: str) -> str: def char_as_exp(char: str) -> str: - """Create a RE Regex Expression that Exactly Matches a Character + """Create a RE regex expression that exactly matches a character. - Escape to avoid reserved character classes (i.e. \s, \S, \d, \D, \1, etc.). + Escape to avoid reserved character classes (i.e. \\s, \\S, \\d, \\D, \\1, etc.). Args: char (str): Character to match. @@ -109,7 +105,7 @@ def char_as_exp(char: str) -> str: Returns: str: RE expression that exactly matches the original character. """ - if char in _SAFE_CHARS: + if char in SAFE_CHARS: # Safe as-is return char else: @@ -118,7 +114,7 @@ def char_as_exp(char: str) -> str: def char_as_exp2(char: str) -> str: - """Create a RE2 Regex Expression that Exactly Matches a Character + """Create a RE2 regex expression that exactly matches a character. Args: char (str): Character to match. 
@@ -126,10 +122,10 @@ def char_as_exp2(char: str) -> str: Returns: str: RE2 expression that exactly matches the original character. """ - if char in _SAFE_CHARS: + if char in SAFE_CHARS: # Safe as-is return char - elif char in _RE2_ESCAPABLE_CHARS: + elif char in RE2_ESCAPABLE_CHARS: # Safe to escape with backslash return f"\\{char}" else: @@ -138,7 +134,7 @@ def char_as_exp2(char: str) -> str: def string_as_exp(text: str) -> str: - """Create a RE Regex Expression that Exactly Matches a String + """Create a RE regex expression that exactly matches a string. Args: text (str): String to match. @@ -150,7 +146,7 @@ def string_as_exp(text: str) -> str: def string_as_exp2(text: str) -> str: - """Create a RE2 Regex Expression that Exactly Matches a String + """Create a RE2 regex expression that exactly matches a string. Args: text (str): String to match. @@ -162,7 +158,7 @@ def string_as_exp2(text: str) -> str: def strings_as_exp(texts: Iterable[str]) -> str: - """Create a RE Regex expression that Exactly Matches Any One String + """Create a RE regex expression that exactly matches any one string. Args: texts (Iterable[str]): Strings to match. @@ -179,7 +175,7 @@ def strings_as_exp(texts: Iterable[str]) -> str: def strings_as_exp2(texts: Iterable[str]) -> str: - """Create a RE2 Regex expression that Exactly Matches Any One String + """Create a RE2 regex expression that exactly matches any one string. Args: texts (Iterable[str]): Strings to match. @@ -196,7 +192,7 @@ def strings_as_exp2(texts: Iterable[str]) -> str: def iter_char_range(first_codepoint: int, last_codepoint: int) -> Iterable[str]: - """Iterate All Characters within a Range of Codepoints (Inclusive) + """Iterate all character within a range of codepoints (inclusive). Args: first_codepoint (int): Starting (first) codepoint. 
@@ -210,7 +206,7 @@ def iter_char_range(first_codepoint: int, last_codepoint: int) -> Iterable[str]: def char_range(first_codepoint: int, last_codepoint: int) -> tuple[str, ...]: - """Tuple of All Characters within a Range of Codepoints (Inclusive) + """Tuple of all character within a range of codepoints (inclusive). Args: first_codepoint (int): Starting (first) codepoint. @@ -227,15 +223,15 @@ def mask_span( span: list[int] | tuple[int, int], mask: str | None = None, ) -> str: - """Slice and Mask a String using a Span + """Slice and mask a string using a single span. Args: - text (str): Text to slice. + text (str): String to slice. span (list[int] | tuple[int, int]): Domain of index positions (start, end) to mask. mask (str, optional): Mask to insert after slicing. Defaults to None. Returns: - str: Text with span replaced with the mask text. + str: String with span replaced with the mask text. """ if not 0 <= span[0] <= span[1] <= len(text): raise ValueError(f"Invalid index positions for start and end: {span}") @@ -252,15 +248,15 @@ def mask_spans( spans: Iterable[list[int] | tuple[int, int]], masks: Iterable[str] | None = None, ) -> str: - """Slice and Mask a String using Multiple Spans + """Slice and mask a string using multiple spans. Args: - text (str): Text to slice. + text (str): String to slice. spans (Iterable[list[int] | tuple[int, int]]): Domains of index positions (x1, x2) to mask from the text. masks (Iterable[str], optional): Masks to insert when slicing. Defaults to None. Returns: - str: Text with all spans replaced with the mask text. + str: String with all spans replaced with the mask text. """ if masks is None: # No masks @@ -279,7 +275,7 @@ def to_utf8(text): def to_nfc(text: str) -> str: - """Normalize a Unicode String to NFC Form C + """Normalize a Unicode string to NFC form C. Form C favors the use of a fully combined character. 
diff --git a/src/regex_toolkit/constants.py b/src/regex_toolkit/constants.py new file mode 100644 index 0000000..d21f955 --- /dev/null +++ b/src/regex_toolkit/constants.py @@ -0,0 +1,21 @@ +"""Constant values. + +This module contains constant values used throughout the project. +""" + +import string +from typing import Final + +__all__ = [ + "ALPHA_CHARS", + "DIGIT_CHARTS", + "SAFE_CHARS", + "RE2_ESCAPABLE_CHARS", +] + +ALPHA_CHARS: Final[set[str]] = set(string.ascii_letters) +DIGIT_CHARTS: Final[set[str]] = set(string.digits) +SAFE_CHARS: Final[set[str]] = ALPHA_CHARS.union(DIGIT_CHARTS).union( + set(string.whitespace) +) +RE2_ESCAPABLE_CHARS: Final[set[str]] = set(string.punctuation) diff --git a/tests/test_functions.py b/tests/test_functions.py index a03953b..a0afba8 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,9 +1,9 @@ import unittest -import regex_toolkit - from collections.abc import Iterable from itertools import product +import regex_toolkit + class TestStringMethods(unittest.TestCase): def test_iter_sort_by_len(self): @@ -131,27 +131,27 @@ def test_sort_by_len(self): prev_len = len(text) def test_string_as_exp_safe_chars(self): - text = "".join(regex_toolkit._safe_chars) + text = "".join(regex_toolkit.constants.SAFE_CHARS) actual_exp = regex_toolkit.string_as_exp(text) - expected_exp = "".join(regex_toolkit._safe_chars) + expected_exp = "".join(regex_toolkit.constants.SAFE_CHARS) self.assertEqual(actual_exp, expected_exp) def test_string_as_exp2_escapable_chars(self): - text = "".join(regex_toolkit._escapable_chars) + text = "".join(regex_toolkit.constants.RE2_ESCAPABLE_CHARS) actual_exp = regex_toolkit.string_as_exp2(text) - expected_exp = "\\" + "\\".join(regex_toolkit._escapable_chars) + expected_exp = "\\" + "\\".join(regex_toolkit.constants.RE2_ESCAPABLE_CHARS) self.assertEqual(actual_exp, expected_exp) def test_string_as_exp_safe_chars(self): - text = "".join(regex_toolkit._safe_chars) + text = 
"".join(regex_toolkit.constants.SAFE_CHARS) actual_exp = regex_toolkit.string_as_exp(text) - expected_exp = "".join(regex_toolkit._safe_chars) + expected_exp = "".join(regex_toolkit.constants.SAFE_CHARS) self.assertEqual(actual_exp, expected_exp) def test_string_as_exp2_escapable_chars(self): - text = "".join(regex_toolkit._escapable_chars) + text = "".join(regex_toolkit.constants.RE2_ESCAPABLE_CHARS) actual_exp = regex_toolkit.string_as_exp2(text) - expected_exp = "\\" + "\\".join(regex_toolkit._escapable_chars) + expected_exp = "\\" + "\\".join(regex_toolkit.constants.RE2_ESCAPABLE_CHARS) self.assertEqual(actual_exp, expected_exp) def test_iter_char_range(self): From 4e80e72aa0276b15fddf326d5d3c36c65cfb7223 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sun, 12 Mar 2023 06:21:29 -0500 Subject: [PATCH 3/7] Update overall module description. Moved helpers into `utils.py`. Moved constants into `constants.py`. Added `enums.py`. Rename `char_as_exp` to simply `escape`, as it is a more accurate name. Now sharing pattern building functions for both RE and RE2 to be specified by an enum (this could change in the future). Naming that involves "codepoint" will now be used as "cpoint" instead to save line room. Added tests. Updated readme. 
--- README.md | 406 ++++------------- docs/templates/header.md.jinja | 2 +- pyproject.toml | 4 +- src/regex_toolkit/.gitignore | 2 + src/regex_toolkit/__init__.py | 28 +- src/regex_toolkit/base.py | 290 +++--------- src/regex_toolkit/constants.py | 15 +- src/regex_toolkit/enums.py | 15 + src/regex_toolkit/utils.py | 208 +++++++++ tests/test_functions.py | 783 +++++++++++++++++++++------------ 10 files changed, 898 insertions(+), 855 deletions(-) create mode 100644 src/regex_toolkit/enums.py create mode 100644 src/regex_toolkit/utils.py diff --git a/README.md b/README.md index ab34358..bdfe0e2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Regex-Toolkit -[Regex-Toolkit](https://github.com/Phosmic/regex-toolkit) Effortlessly craft efficient [RE](https://docs.python.org/3/library/re.html) and [RE2](https://github.com/google/re2) expressions with user-friendly tools. +[Regex-Toolkit](https://github.com/Phosmic/regex-toolkit) provides tools for creating [RE](https://docs.python.org/3/library/re.html) and [RE2](https://github.com/google/re2) expressions. --- @@ -73,18 +73,18 @@ import regex_toolkit ## Library - + -# `regex_toolkit.base` +# `regex_toolkit.utils` - + #### `iter_sort_by_len` ```python def iter_sort_by_len(texts: Iterable[str], *, - reverse: bool = False) -> Iterable[str] + reverse: bool = False) -> Generator[str, None, None] ``` Iterate strings sorted by length. @@ -98,7 +98,7 @@ Iterate strings sorted by length. - _str_ - Strings sorted by length. - + #### `sort_by_len` @@ -117,18 +117,25 @@ Sort strings by length. **Returns**: -- _tuple[str]_ - Strings sorted by length. + tuple[str, ...]: Strings sorted by length. - + -#### `ord_to_codepoint` +#### `ord_to_cpoint` ```python -def ord_to_codepoint(ordinal: int) -> str +def ord_to_cpoint(ordinal: int) -> str ``` Character codepoint from character ordinal. +**Example**: + + ```python + # Output: '00000061' + ord_to_cpoint(97) + ``` + **Arguments**: - `ordinal` _int_ - Character ordinal. 
@@ -137,192 +144,109 @@ Character codepoint from character ordinal. - _str_ - Character codepoint. - + -#### `codepoint_to_ord` +#### `cpoint_to_ord` ```python -def codepoint_to_ord(codepoint: str) -> int +def cpoint_to_ord(cpoint: str) -> int ``` Character ordinal from character codepoint. **Arguments**: -- `codepoint` _str_ - Character codepoint. +- `cpoint` _str_ - Character codepoint. **Returns**: - _int_ - Character ordinal. - + -#### `char_to_codepoint` +#### `char_to_cpoint` ```python -def char_to_codepoint(char: str) -> str +def char_to_cpoint(char: str) -> str ``` Character codepoint from character. -**Arguments**: +**Example**: -- `char` _str_ - Character. - -**Returns**: - -- _str_ - Character codepoint. - - - -#### `char_as_exp` - -```python -def char_as_exp(char: str) -> str -``` - -Create a RE regex expression that exactly matches a character. - -Escape to avoid reserved character classes (i.e. \\s, \\S, \\d, \\D, \\1, etc.). + ```python + # Output: '00000061' + char_to_cpoint("a") + ``` **Arguments**: -- `char` _str_ - Character to match. - -**Returns**: - -- _str_ - RE expression that exactly matches the original character. - - - -#### `char_as_exp2` - -```python -def char_as_exp2(char: str) -> str -``` - -Create a RE2 regex expression that exactly matches a character. - -**Arguments**: - -- `char` _str_ - Character to match. - -**Returns**: - -- _str_ - RE2 expression that exactly matches the original character. - - - -#### `string_as_exp` - -```python -def string_as_exp(text: str) -> str -``` - -Create a RE regex expression that exactly matches a string. - -**Arguments**: - -- `text` _str_ - String to match. - -**Returns**: - -- _str_ - RE expression that exactly matches the original string. - - - -#### `string_as_exp2` - -```python -def string_as_exp2(text: str) -> str -``` - -Create a RE2 regex expression that exactly matches a string. - -**Arguments**: - -- `text` _str_ - String to match. +- `char` _str_ - Character. 
**Returns**: -- _str_ - RE2 expression that exactly matches the original string. +- _str_ - Character codepoint. - + -#### `strings_as_exp` +#### `to_nfc` ```python -def strings_as_exp(texts: Iterable[str]) -> str +def to_nfc(text: str) -> str ``` -Create a RE regex expression that exactly matches any one string. - -**Arguments**: - -- `texts` _Iterable[str]_ - Strings to match. - -**Returns**: - -- _str_ - RE expression that exactly matches any one of the original strings. - - - -#### `strings_as_exp2` - -```python -def strings_as_exp2(texts: Iterable[str]) -> str -``` +Normalize a Unicode string to NFC form C. -Create a RE2 regex expression that exactly matches any one string. +Form C favors the use of a fully combined character. **Arguments**: -- `texts` _Iterable[str]_ - Strings to match. +- `text` _str_ - String to normalize. **Returns**: -- _str_ - RE2 expression that exactly matches any one of the original strings. +- _str_ - Normalized string. - + #### `iter_char_range` ```python -def iter_char_range(first_codepoint: int, - last_codepoint: int) -> Iterable[str] +def iter_char_range(first_cpoint: int, + last_cpoint: int) -> Generator[str, None, None] ``` Iterate all character within a range of codepoints (inclusive). **Arguments**: -- `first_codepoint` _int_ - Starting (first) codepoint. -- `last_codepoint` _int_ - Ending (last) codepoint. +- `first_cpoint` _int_ - Starting (first) codepoint. +- `last_cpoint` _int_ - Ending (last) codepoint. **Yields**: - _str_ - Character from within a range of codepoints. - + #### `char_range` ```python -def char_range(first_codepoint: int, last_codepoint: int) -> tuple[str, ...] +def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...] ``` Tuple of all character within a range of codepoints (inclusive). **Arguments**: -- `first_codepoint` _int_ - Starting (first) codepoint. -- `last_codepoint` _int_ - Ending (last) codepoint. +- `first_cpoint` _int_ - Starting (first) codepoint. 
+- `last_cpoint` _int_ - Ending (last) codepoint. **Returns**: tuple[str, ...]: Characters within a range of codepoints. - + #### `mask_span` @@ -344,7 +268,11 @@ Slice and mask a string using a single span. - _str_ - String with span replaced with the mask text. - +**Raises**: + +- `ValueError` - Invalid index positions for start and end. + + #### `mask_spans` @@ -356,6 +284,8 @@ def mask_spans(text: str, Slice and mask a string using multiple spans. +Todo: Add support for overlapping (and unordered?) spans. + **Arguments**: - `text` _str_ - String to slice. @@ -366,251 +296,103 @@ Slice and mask a string using multiple spans. - _str_ - String with all spans replaced with the mask text. - +**Raises**: -#### `to_nfc` - -```python -def to_nfc(text: str) -> str -``` - -Normalize a Unicode string to NFC form C. - -Form C favors the use of a fully combined character. +- `ValueError` - Invalid index positions for start and end. -**Arguments**: - -- `text` _str_ - String to normalize. - -**Returns**: - -- _str_ - Normalized string. - - - -# `regex_toolkit.base_BAK_2022-11-18` - - - -## `RegexToolkit` Objects - -```python -class RegexToolkit() -``` - - - -#### `RegexToolkit.char_as_exp` - -```python -@staticmethod -def char_as_exp(char: str) -> str -``` - -Create a re Regex Expression that Exactly Matches a Character - -Expressions like \s, \S, \d, \D, \1, etc. are reserved. - -**Arguments**: - -- `char` _str_ - Character to match. - -**Returns**: + -- _str_ - re expression that exactly matches the original character. +# `regex_toolkit.base` - + -#### `RegexToolkit.char_as_exp2` +#### `escape` ```python -@staticmethod -def char_as_exp2(char: str) -> str +def escape(char: str, flavor: int = 1) -> str ``` -Create a re2 Regex Expression that Exactly Matches a Character +Create a regex expression that exactly matches a character. **Arguments**: - `char` _str_ - Character to match. +- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. 
**Returns**: -- _str_ - re2 expression that exactly matches the original character. - - - -#### `RegexToolkit.string_as_exp` - -```python -@staticmethod -def string_as_exp(text: str) -> str -``` - -Create a re Regex Expression that Exactly Matches a String - -**Arguments**: - -- `text` _str_ - String to match. +- _str_ - Expression that exactly matches the original character. -**Returns**: +**Raises**: -- _str_ - re expression that exactly matches the original string. +- `ValueError` - Invalid regex flavor. - + -#### `RegexToolkit.string_as_exp2` +#### `string_as_exp` ```python -@staticmethod -def string_as_exp2(text: str) -> str +def string_as_exp(text: str, flavor: int = 1) -> str ``` -Create a re2 Regex Expression that Exactly Matches a String +Create a regex expression that exactly matches a string. **Arguments**: - `text` _str_ - String to match. +- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. **Returns**: -- _str_ - re2 expression that exactly matches the original string. - - - -#### `RegexToolkit.strings_as_exp` - -```python -@staticmethod -def strings_as_exp(texts: Iterable[str]) -> str -``` - -re - - - -#### `RegexToolkit.strings_as_exp2` - -```python -@staticmethod -def strings_as_exp2(texts: Iterable[str]) -> str -``` - -re2 - - - -#### `RegexToolkit.iter_char_range` - -```python -@staticmethod -def iter_char_range(first_codepoint: int, - last_codepoint: int) -> Iterable[str] -``` - -Iterate All Characters within a Range of Codepoints (Inclusive) - -**Arguments**: - -- `first_codepoint` _int_ - Starting codepoint. -- `last_codepoint` _int_ - Final codepoint. - -**Yields**: - -- _str_ - Character from within a range of codepoints. - - - -#### `RegexToolkit.char_range` - -```python -@staticmethod -def char_range(first_codepoint: int, last_codepoint: int) -> tuple[str, ...] 
-``` - -Tuple of All Characters within a Range of Codepoints (Inclusive) - -**Arguments**: +- _str_ - Expression that exactly matches the original string. -- `first_codepoint` _int_ - Starting codepoint. -- `last_codepoint` _int_ - Final codepoint. +**Raises**: -**Returns**: - - tuple[str, ...]: Characters within a range of codepoints. +- `ValueError` - Invalid regex flavor. - + -#### `RegexToolkit.is_digit` +#### `strings_as_exp` ```python -@staticmethod -def is_digit(char: str) -> bool +def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str ``` -Check if a Character is a Digit [0-9] +Create a regex expression that exactly matches any one string. **Arguments**: -- `char` _str_ - Character to check. +- `texts` _Iterable[str]_ - Strings to match. +- `flavor` _int, optional_ - Regex flavor (1 for RE, 2 for RE2). Defaults to 1. **Returns**: -- _bool_ - True if the character is a digit. - - - -#### `RegexToolkit.mask_span` - -```python -@staticmethod -def mask_span(text: str, span, mask: str | None = None) -> str -``` - -Slice and Mask Text using a Span +- _str_ - Expression that exactly matches any one of the original strings. - +**Raises**: -#### `RegexToolkit.mask_spans` +- `ValueError` - Invalid regex flavor. -```python -@staticmethod -def mask_spans(text: str, spans: Iterable[Sequence[int]], - masks: Iterable[str]) -> str -``` - -Slice and Mask a String using Multiple Spans - -NOTE: Original values for spans and masks parameters will be modified! - -**Arguments**: + -- `text` _str_ - Text to slice. -- `spans` _Spans_ - Domains of index positions to mask from the text. -- `masks` _Masks, optional_ - Masks to insert when slicing. Defaults to None. +# `regex_toolkit.enums` -**Returns**: - -- _str_ - Text with all spans replaced with the mask text. +Enums. 
- + -#### `RegexToolkit.to_utf8` +## `RegexFlavor` Objects ```python -@staticmethod -def to_utf8(text: str) -> str +class RegexFlavor(int, Enum) ``` -Force UTF-8 Text Encoding - -**Arguments**: - -- `text` _str_ - Text to encode. +Regex flavors. -**Returns**: +**Attributes**: -- _str_ - Encoded text. +- `RE` _int_ - Standard Python regex flavor. +- `RE2` _int_ - Google RE2 regex flavor. --- diff --git a/docs/templates/header.md.jinja b/docs/templates/header.md.jinja index 40fbde6..c4e70d9 100644 --- a/docs/templates/header.md.jinja +++ b/docs/templates/header.md.jinja @@ -1 +1 @@ -[{{ repo.full_name }}](https://github.com/{{ repo.owner }}/{{ repo.name }}) Effortlessly craft efficient [RE](https://docs.python.org/3/library/re.html) and [RE2](https://github.com/google/re2) expressions with user-friendly tools. \ No newline at end of file +[{{ repo.full_name }}](https://github.com/{{ repo.owner }}/{{ repo.name }}) provides tools for creating [RE](https://docs.python.org/3/library/re.html) and [RE2](https://github.com/google/re2) expressions. 
\ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 068f981..3806815 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta" [project] name = "regex_toolkit" -description = "Effortlessly craft efficient RE and RE2 expressions with user-friendly tools" +description = "Tools for creating RE and RE2 expressions" readme = "README.md" requires-python = ">=3.10,<4" license = { file = 'LICENSE' } -keywords = ["re", "re2", "expression", "regex", "pattern", "tool", "toolkit"] +keywords = ["re", "re2", "regex", "expression", "pattern", "tool", "toolkit"] authors = [{ name = "The Phosmic Development Team", email = "dev@phosmic.com" }] classifiers = [ "Development Status :: 3 - Alpha", diff --git a/src/regex_toolkit/.gitignore b/src/regex_toolkit/.gitignore index 5f3e828..69400b7 100644 --- a/src/regex_toolkit/.gitignore +++ b/src/regex_toolkit/.gitignore @@ -5,5 +5,7 @@ !__init__.py !base.py !constants.py +!enums.py +!utils.py !.gitignore diff --git a/src/regex_toolkit/__init__.py b/src/regex_toolkit/__init__.py index ff07dce..d878546 100644 --- a/src/regex_toolkit/__init__.py +++ b/src/regex_toolkit/__init__.py @@ -1,39 +1,35 @@ from .base import ( - char_as_exp, - char_as_exp2, + escape, + string_as_exp, + strings_as_exp, +) +from .utils import ( char_range, - char_to_codepoint, - codepoint_to_ord, + char_to_cpoint, + cpoint_to_ord, iter_char_range, iter_sort_by_len, mask_span, mask_spans, - ord_to_codepoint, + ord_to_cpoint, sort_by_len, - string_as_exp, - string_as_exp2, - strings_as_exp, - strings_as_exp2, to_nfc, to_utf8, ) __all__ = [ - "char_as_exp", - "char_as_exp2", + "escape", "char_range", - "char_to_codepoint", - "codepoint_to_ord", + "char_to_cpoint", + "cpoint_to_ord", "iter_char_range", "iter_sort_by_len", "mask_span", "mask_spans", - "ord_to_codepoint", + "ord_to_cpoint", "sort_by_len", "string_as_exp", - "string_as_exp2", "strings_as_exp", - "strings_as_exp2", "to_nfc", "to_utf8", 
] diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index 75599a0..7c17b18 100644 --- a/src/regex_toolkit/base.py +++ b/src/regex_toolkit/base.py @@ -1,110 +1,41 @@ __all__ = [ - "char_as_exp", - "char_as_exp2", - "char_range", - "char_to_codepoint", - "codepoint_to_ord", - "iter_char_range", - "iter_sort_by_len", - "mask_span", - "mask_spans", - "ord_to_codepoint", - "sort_by_len", + "escape", "string_as_exp", - "string_as_exp2", "strings_as_exp", - "strings_as_exp2", - "to_nfc", - "to_utf8", ] -import unicodedata from collections.abc import Iterable -from regex_toolkit.constants import RE2_ESCAPABLE_CHARS, SAFE_CHARS +from regex_toolkit.constants import ESCAPE_CHARS, SAFE_CHARS +from regex_toolkit.enums import RegexFlavor +from regex_toolkit.utils import char_to_cpoint, iter_sort_by_len -def iter_sort_by_len( - texts: Iterable[str], - *, - reverse: bool = False, -) -> Iterable[str]: - """Iterate strings sorted by length. +def escape(char: str, flavor: int = 1) -> str: + """Create a regex expression that exactly matches a character. Args: - texts (Iterable[str]): Strings to sort. - reverse (bool, optional): Sort in descending order (longest to shortest). Defaults to False. - - Yields: - str: Strings sorted by length. - """ - for text in sorted(texts, key=len, reverse=reverse): - yield text - - -def sort_by_len( - texts: Iterable[str], - *, - reverse: bool = False, -) -> tuple[str, ...]: - """Sort strings by length. - - Args: - texts (Iterable[str]): Strings to sort. - reverse (bool, optional): Sort in descending order (longest to shortest). Defaults to False. - - Returns: - tuple[str]: Strings sorted by length. - """ - return tuple(iter_sort_by_len(texts, reverse=reverse)) - - -def ord_to_codepoint(ordinal: int) -> str: - """Character codepoint from character ordinal. - - Args: - ordinal (int): Character ordinal. + char (str): Character to match. + flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. 
Returns: - str: Character codepoint. - """ - return format(ordinal, "x").zfill(8) - - -def codepoint_to_ord(codepoint: str) -> int: - """Character ordinal from character codepoint. + str: Expression that exactly matches the original character. - Args: - codepoint (str): Character codepoint. - - Returns: - int: Character ordinal. + Raises: + ValueError: Invalid regex flavor. """ - return int(codepoint, 16) - - -def char_to_codepoint(char: str) -> str: - """Character codepoint from character. + try: + flavor = RegexFlavor(flavor) + except ValueError: + raise ValueError(f"Invalid regex flavor: {flavor}") - Args: - char (str): Character. - - Returns: - str: Character codepoint. - """ - return ord_to_codepoint(ord(char)) - - -def char_as_exp(char: str) -> str: - """Create a RE regex expression that exactly matches a character. + if flavor == RegexFlavor.RE: + return _escape(char) + # elif flavor == RegexFlavor.RE2: + else: + return _escape2(char) - Escape to avoid reserved character classes (i.e. \\s, \\S, \\d, \\D, \\1, etc.). - Args: - char (str): Character to match. - - Returns: - str: RE expression that exactly matches the original character. - """ +def _escape(char: str) -> str: if char in SAFE_CHARS: # Safe as-is return char @@ -113,176 +44,79 @@ def char_as_exp(char: str) -> str: return f"\\{char}" -def char_as_exp2(char: str) -> str: - """Create a RE2 regex expression that exactly matches a character. - - Args: - char (str): Character to match. - - Returns: - str: RE2 expression that exactly matches the original character. - """ +def _escape2(char: str) -> str: if char in SAFE_CHARS: # Safe as-is return char - elif char in RE2_ESCAPABLE_CHARS: + elif char in ESCAPE_CHARS: # Safe to escape with backslash return f"\\{char}" else: # Otherwise escape using the codepoint - return "\\x{" + char_to_codepoint(char) + "}" - - -def string_as_exp(text: str) -> str: - """Create a RE regex expression that exactly matches a string. - - Args: - text (str): String to match. 
- - Returns: - str: RE expression that exactly matches the original string. - """ - return r"".join(map(char_as_exp, text)) + return "\\x{" + char_to_cpoint(char) + "}" -def string_as_exp2(text: str) -> str: - """Create a RE2 regex expression that exactly matches a string. +def string_as_exp(text: str, flavor: int = 1) -> str: + """Create a regex expression that exactly matches a string. Args: text (str): String to match. + flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. Returns: - str: RE2 expression that exactly matches the original string. - """ - return r"".join(map(char_as_exp2, text)) - - -def strings_as_exp(texts: Iterable[str]) -> str: - """Create a RE regex expression that exactly matches any one string. - - Args: - texts (Iterable[str]): Strings to match. + str: Expression that exactly matches the original string. - Returns: - str: RE expression that exactly matches any one of the original strings. + Raises: + ValueError: Invalid regex flavor. """ - return r"|".join( - map( - string_as_exp, - iter_sort_by_len(texts, reverse=True), - ) - ) - - -def strings_as_exp2(texts: Iterable[str]) -> str: - """Create a RE2 regex expression that exactly matches any one string. - - Args: - texts (Iterable[str]): Strings to match. + try: + flavor = RegexFlavor(flavor) + except ValueError: + raise ValueError(f"Invalid regex flavor: {flavor}") - Returns: - str: RE2 expression that exactly matches any one of the original strings. - """ - return r"|".join( - map( - string_as_exp2, - iter_sort_by_len(texts, reverse=True), - ) - ) + if flavor == RegexFlavor.RE: + return _string_as_exp(text) + # elif flavor == RegexFlavor.RE2: + else: + return _string_as_exp2(text) -def iter_char_range(first_codepoint: int, last_codepoint: int) -> Iterable[str]: - """Iterate all character within a range of codepoints (inclusive). +def _string_as_exp(text: str) -> str: + return r"".join(map(_escape, text)) - Args: - first_codepoint (int): Starting (first) codepoint. 
- last_codepoint (int): Ending (last) codepoint. - Yields: - str: Character from within a range of codepoints. - """ - for i in range(ord(first_codepoint), ord(last_codepoint) + 1): - yield chr(i) +def _string_as_exp2(text: str) -> str: + return r"".join(map(_escape2, text)) -def char_range(first_codepoint: int, last_codepoint: int) -> tuple[str, ...]: - """Tuple of all character within a range of codepoints (inclusive). +def strings_as_exp(texts: Iterable[str], flavor: int = 1) -> str: + """Create a regex expression that exactly matches any one string. Args: - first_codepoint (int): Starting (first) codepoint. - last_codepoint (int): Ending (last) codepoint. + texts (Iterable[str]): Strings to match. + flavor (int, optional): Regex flavor (1 for RE, 2 for RE2). Defaults to 1. Returns: - tuple[str, ...]: Characters within a range of codepoints. - """ - return tuple(iter_char_range(first_codepoint, last_codepoint)) - - -def mask_span( - text: str, - span: list[int] | tuple[int, int], - mask: str | None = None, -) -> str: - """Slice and mask a string using a single span. - - Args: - text (str): String to slice. - span (list[int] | tuple[int, int]): Domain of index positions (start, end) to mask. - mask (str, optional): Mask to insert after slicing. Defaults to None. + str: Expression that exactly matches any one of the original strings. - Returns: - str: String with span replaced with the mask text. + Raises: + ValueError: Invalid regex flavor. 
""" - if not 0 <= span[0] <= span[1] <= len(text): - raise ValueError(f"Invalid index positions for start and end: {span}") - if mask is None: - # No mask - return text[: span[0]] + text[span[1] :] - else: - # Use mask - return text[: span[0]] + mask + text[span[1] :] - + try: + flavor = RegexFlavor(flavor) + except ValueError: + raise ValueError(f"Invalid regex flavor: {flavor}") -def mask_spans( - text: str, - spans: Iterable[list[int] | tuple[int, int]], - masks: Iterable[str] | None = None, -) -> str: - """Slice and mask a string using multiple spans. - - Args: - text (str): String to slice. - spans (Iterable[list[int] | tuple[int, int]]): Domains of index positions (x1, x2) to mask from the text. - masks (Iterable[str], optional): Masks to insert when slicing. Defaults to None. - - Returns: - str: String with all spans replaced with the mask text. - """ - if masks is None: - # No masks - for span in reversed(spans): - text = mask_span(text, span, mask=None) + if flavor == RegexFlavor.RE: + return _strings_as_exp(texts) + # elif flavor == RegexFlavor.RE2: else: - # Has mask - for span, mask in zip(reversed(spans), reversed(masks)): - text = mask_span(text, span, mask=mask) - - return text - + return _strings_as_exp2(texts) -def to_utf8(text): - return text.encode("utf-8").decode("utf-8") +def _strings_as_exp(texts: Iterable[str]) -> str: + return r"|".join(map(_string_as_exp, iter_sort_by_len(texts, reverse=True))) -def to_nfc(text: str) -> str: - """Normalize a Unicode string to NFC form C. - Form C favors the use of a fully combined character. - - Args: - text (str): String to normalize. - - Returns: - str: Normalized string. 
- """ - return unicodedata.normalize("NFC", text) +def _strings_as_exp2(texts: Iterable[str]) -> str: + return r"|".join(map(_string_as_exp2, iter_sort_by_len(texts, reverse=True))) diff --git a/src/regex_toolkit/constants.py b/src/regex_toolkit/constants.py index d21f955..e67ec4b 100644 --- a/src/regex_toolkit/constants.py +++ b/src/regex_toolkit/constants.py @@ -3,19 +3,16 @@ This module contains constant values used throughout the project. """ -import string from typing import Final __all__ = [ - "ALPHA_CHARS", - "DIGIT_CHARTS", "SAFE_CHARS", - "RE2_ESCAPABLE_CHARS", + "ESCAPE_CHARS", ] -ALPHA_CHARS: Final[set[str]] = set(string.ascii_letters) -DIGIT_CHARTS: Final[set[str]] = set(string.digits) -SAFE_CHARS: Final[set[str]] = ALPHA_CHARS.union(DIGIT_CHARTS).union( - set(string.whitespace) +SAFE_CHARS: Final[frozenset[str]] = frozenset( + map(chr, b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") +) +ESCAPE_CHARS: Final[frozenset[str]] = frozenset( + map(chr, b"()[]{}?*+-|^$\\.&~# \t\n\r\v\f") ) -RE2_ESCAPABLE_CHARS: Final[set[str]] = set(string.punctuation) diff --git a/src/regex_toolkit/enums.py b/src/regex_toolkit/enums.py new file mode 100644 index 0000000..55a9b9d --- /dev/null +++ b/src/regex_toolkit/enums.py @@ -0,0 +1,15 @@ +"""Enums.""" + +from enum import Enum + + +class RegexFlavor(int, Enum): + """Regex flavors. + + Attributes: + RE (int): Standard Python regex flavor. + RE2 (int): Google RE2 regex flavor. 
+ """ + + RE = 1 + RE2 = 2 diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py new file mode 100644 index 0000000..934d49c --- /dev/null +++ b/src/regex_toolkit/utils.py @@ -0,0 +1,208 @@ +import unicodedata +from collections.abc import Generator, Iterable + +__all__ = [ + "iter_sort_by_len", + "sort_by_len", + "ord_to_cpoint", + "cpoint_to_ord", + "char_to_cpoint", + "to_utf8", + "to_nfc", + "iter_char_range", + "char_range", + "mask_span", + "mask_spans", +] + + +def iter_sort_by_len( + texts: Iterable[str], + *, + reverse: bool = False, +) -> Generator[str, None, None]: + """Iterate strings sorted by length. + + Args: + texts (Iterable[str]): Strings to sort. + reverse (bool, optional): Sort in descending order (longest to shortest). Defaults to False. + + Yields: + str: Strings sorted by length. + """ + for text in sorted(texts, key=len, reverse=reverse): + yield text + + +def sort_by_len( + texts: Iterable[str], + *, + reverse: bool = False, +) -> tuple[str, ...]: + """Sort strings by length. + + Args: + texts (Iterable[str]): Strings to sort. + reverse (bool, optional): Sort in descending order (longest to shortest). Defaults to False. + + Returns: + tuple[str, ...]: Strings sorted by length. + """ + return tuple(iter_sort_by_len(texts, reverse=reverse)) + + +def ord_to_cpoint(ordinal: int) -> str: + """Character codepoint from character ordinal. + + Example: + + ```python + # Output: '00000061' + ord_to_cpoint(97) + ``` + + Args: + ordinal (int): Character ordinal. + + Returns: + str: Character codepoint. + """ + return format(ordinal, "x").zfill(8) + + +def cpoint_to_ord(cpoint: str) -> int: + """Character ordinal from character codepoint. + + Args: + cpoint (str): Character codepoint. + + Returns: + int: Character ordinal. + """ + return int(cpoint, 16) + + +def char_to_cpoint(char: str) -> str: + """Character codepoint from character. 
+ + Example: + + ```python + # Output: '00000061' + char_to_cpoint("a") + ``` + + Args: + char (str): Character. + + Returns: + str: Character codepoint. + """ + return ord_to_cpoint(ord(char)) + + +def to_utf8(text): + return text.encode("utf-8").decode("utf-8") + + +def to_nfc(text: str) -> str: + """Normalize a Unicode string to NFC form C. + + Form C favors the use of a fully combined character. + + Args: + text (str): String to normalize. + + Returns: + str: Normalized string. + """ + return unicodedata.normalize("NFC", text) + + +def iter_char_range(first_cpoint: int, last_cpoint: int) -> Generator[str, None, None]: + """Iterate all character within a range of codepoints (inclusive). + + Args: + first_cpoint (int): Starting (first) codepoint. + last_cpoint (int): Ending (last) codepoint. + + Yields: + str: Character from within a range of codepoints. + """ + for i in range(ord(first_cpoint), ord(last_cpoint) + 1): + yield chr(i) + + +def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...]: + """Tuple of all character within a range of codepoints (inclusive). + + Args: + first_cpoint (int): Starting (first) codepoint. + last_cpoint (int): Ending (last) codepoint. + + Returns: + tuple[str, ...]: Characters within a range of codepoints. + """ + return tuple(iter_char_range(first_cpoint, last_cpoint)) + + +def mask_span( + text: str, + span: list[int] | tuple[int, int], + mask: str | None = None, +) -> str: + """Slice and mask a string using a single span. + + Args: + text (str): String to slice. + span (list[int] | tuple[int, int]): Domain of index positions (start, end) to mask. + mask (str, optional): Mask to insert after slicing. Defaults to None. + + Returns: + str: String with span replaced with the mask text. + + Raises: + ValueError: Invalid index positions for start and end. 
+ """ + if not 0 <= span[0] <= span[1] <= len(text): + raise ValueError(f"Invalid index positions for start and end: {span}") + if mask is None: + # No mask + return text[: span[0]] + text[span[1] :] + else: + # Use mask + return text[: span[0]] + mask + text[span[1] :] + + +def mask_spans( + text: str, + spans: Iterable[list[int] | tuple[int, int]], + masks: Iterable[str] | None = None, +) -> str: + """Slice and mask a string using multiple spans. + + Todo: Add support for overlapping (and unordered?) spans. + + Args: + text (str): String to slice. + spans (Iterable[list[int] | tuple[int, int]]): Domains of index positions (x1, x2) to mask from the text. + masks (Iterable[str], optional): Masks to insert when slicing. Defaults to None. + + Returns: + str: String with all spans replaced with the mask text. + + Raises: + ValueError: Invalid index positions for start and end. + """ + try: + if masks is None: + # No masks + for span in reversed(spans): + text = mask_span(text, span, mask=None) + else: + # Has mask + for span, mask in zip(reversed(spans), reversed(masks)): + text = mask_span(text, span, mask=mask) + except ValueError as err: + raise err + return text diff --git a/tests/test_functions.py b/tests/test_functions.py index a0afba8..7862ae1 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,14 +1,40 @@ +import re import unittest -from collections.abc import Iterable -from itertools import product +from collections.abc import Iterable # Generator +from itertools import combinations_with_replacement # product import regex_toolkit - - -class TestStringMethods(unittest.TestCase): - def test_iter_sort_by_len(self): - # Words used during test - texts = { +from regex_toolkit.constants import ESCAPE_CHARS, SAFE_CHARS +from regex_toolkit.enums import RegexFlavor + + +def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool: + prev_len = None + for text in texts: + if prev_len is None: + prev_len = len(text) + if reverse: + if 
len(text) > prev_len: + return False + else: + if len(text) < prev_len: + return False + prev_len = len(text) + return True + + +class TestRegexFlavor(unittest.TestCase): + def test_flavor(self): + for flavor in RegexFlavor: + with self.subTest(flavor=flavor): + self.assertIsInstance(flavor, RegexFlavor) + self.assertIsInstance(flavor.name, str) + self.assertIsInstance(flavor.value, int) + + +class TestUtils(unittest.TestCase): + def setUp(self) -> None: + self.texts = { "apple", "orange", "banana", @@ -20,299 +46,482 @@ def test_iter_sort_by_len(self): "strawberry", "blackberry", } + self.texts_by_type = ( + (set, self.texts), + (frozenset, frozenset(self.texts)), + (tuple, tuple(self.texts)), + (list, list(self.texts)), + (dict, dict.fromkeys(self.texts, None)), + ) - # Run test using different iterable types - for try_type, texts_as_try_type in { - set: texts, - Iterable: iter(texts), - tuple: tuple(texts), - list: list(texts), - dict: dict.fromkeys(texts, None), - }.items(): - # Not reversed (shortest to longest) - result = regex_toolkit.iter_sort_by_len(texts_as_try_type, reverse=False) - - # Returns a iterable (allows for duplicate entries) - self.assertIsInstance(result, Iterable) - - # Result should have a equal number of texts - self.assertEqual(len(texts), len(tuple(result))) - - prev_len = None - for text in result: - if prev_len is not None: - self.assertGreaterEqual(len(text), prev_len) - - prev_len = len(text) - - # Run test using different iterable types - for try_type, texts_as_try_type in { - set: texts, - Iterable: iter(texts), - tuple: tuple(texts), - list: list(texts), - }.items(): - # Not reversed (longest to shortest) - result = regex_toolkit.iter_sort_by_len(texts_as_try_type, reverse=True) - - # Returns a iterable (allows for duplicate entries) - self.assertIsInstance(result, Iterable) - - # Result should have a equal number of texts - self.assertEqual(len(texts), len(tuple(result))) - - prev_len = None - for text in result: - if prev_len is 
not None: - self.assertLessEqual(len(text), prev_len) - - prev_len = len(text) + def test_iter_sort_by_len(self): + for try_type, typed_texts in self.texts_by_type: + for reverse in (False, True): + with self.subTest( + try_type=try_type, typed_texts=typed_texts, reverse=reverse + ): + result = regex_toolkit.iter_sort_by_len( + typed_texts, reverse=reverse + ) + self.assertIsInstance(result, Iterable) + result_tuple = tuple(result) + self.assertTrue(is_sorted_by_len(result_tuple, reverse=reverse)) + self.assertEqual(len(result_tuple), len(typed_texts)) def test_sort_by_len(self): - # Words used during test - texts = { - "apple", - "orange", - "banana", - "grape", - "apricot", - "cherry", - "plum", - "blueberry", - "strawberry", - "blackberry", - } - - # Run test using different iterable types - for try_type, texts_as_try_type in { - set: texts, - Iterable: iter(texts), - tuple: tuple(texts), - list: list(texts), - dict: dict.fromkeys(texts, None), - }.items(): - # Not reversed (shortest to longest) - result = regex_toolkit.sort_by_len(texts_as_try_type, reverse=False) - - # Returns a tuple (allows for duplicate entries) - self.assertIsInstance(result, tuple) - - # Result should have a equal number of texts - self.assertEqual(len(texts), len(result)) - - prev_len = None - for text in result: - if prev_len is not None: - self.assertGreaterEqual(len(text), prev_len) - - prev_len = len(text) - - # Run test using different iterable types - for try_type, texts_as_try_type in { - set: texts, - Iterable: iter(texts), - tuple: tuple(texts), - list: list(texts), - }.items(): - # Not reversed (longest to shortest) - result = regex_toolkit.sort_by_len(texts_as_try_type, reverse=True) - - # Returns a tuple (allows for duplicate entries) - self.assertIsInstance(result, tuple) - - # Result should have a equal number of texts - self.assertEqual(len(texts), len(result)) - - prev_len = None - for text in result: - if prev_len is not None: - self.assertLessEqual(len(text), prev_len) 
- - prev_len = len(text) - - def test_string_as_exp_safe_chars(self): - text = "".join(regex_toolkit.constants.SAFE_CHARS) - actual_exp = regex_toolkit.string_as_exp(text) - expected_exp = "".join(regex_toolkit.constants.SAFE_CHARS) - self.assertEqual(actual_exp, expected_exp) - - def test_string_as_exp2_escapable_chars(self): - text = "".join(regex_toolkit.constants.RE2_ESCAPABLE_CHARS) - actual_exp = regex_toolkit.string_as_exp2(text) - expected_exp = "\\" + "\\".join(regex_toolkit.constants.RE2_ESCAPABLE_CHARS) - self.assertEqual(actual_exp, expected_exp) - - def test_string_as_exp_safe_chars(self): - text = "".join(regex_toolkit.constants.SAFE_CHARS) - actual_exp = regex_toolkit.string_as_exp(text) - expected_exp = "".join(regex_toolkit.constants.SAFE_CHARS) - self.assertEqual(actual_exp, expected_exp) - - def test_string_as_exp2_escapable_chars(self): - text = "".join(regex_toolkit.constants.RE2_ESCAPABLE_CHARS) - actual_exp = regex_toolkit.string_as_exp2(text) - expected_exp = "\\" + "\\".join(regex_toolkit.constants.RE2_ESCAPABLE_CHARS) - self.assertEqual(actual_exp, expected_exp) + for try_type, typed_texts in self.texts_by_type: + for reverse in (False, True): + with self.subTest( + try_type=try_type, typed_texts=typed_texts, reverse=reverse + ): + result = regex_toolkit.sort_by_len(typed_texts, reverse=reverse) + self.assertIsInstance(result, tuple) + self.assertTrue(is_sorted_by_len(result, reverse=reverse)) + self.assertEqual(len(result), len(typed_texts)) def test_iter_char_range(self): result = regex_toolkit.iter_char_range("a", "z") - - # Returns a iterable (no duplicate entries) self.assertIsInstance(result, Iterable) - - # Validate output - actual_char_range = tuple(result) - excpected_char_range = tuple("abcdefghijklmnopqrstuvwxyz") - self.assertEqual(actual_char_range, excpected_char_range) + self.assertTupleEqual(tuple(result), tuple("abcdefghijklmnopqrstuvwxyz")) def test_char_range(self): - result = regex_toolkit.char_range("a", "z") + 
self.assertEqual( + regex_toolkit.char_range("a", "z"), tuple("abcdefghijklmnopqrstuvwxyz") + ) - # Returns a tuple (no duplicate entries) - self.assertIsInstance(result, tuple) - # Validate output - actual_char_range = result - excpected_char_range = tuple("abcdefghijklmnopqrstuvwxyz") - self.assertEqual(actual_char_range, excpected_char_range) +class TestMasking(unittest.TestCase): + def setUp(self): + self.text = "This is an example" - def test_mask_span(self): - text = "This is an example" - - # Run test using different acceptable sequence types + def test_insert_word(self): indexes = (8, 8) - for try_type, indexes_as_try_type in { - tuple: indexes, - list: list(indexes), - }.items(): - actual_text = regex_toolkit.mask_span(text, indexes_as_try_type, "not ") - expected_text = "This is not an example" - self.assertEqual(actual_text, expected_text) - - # Run test using different acceptable sequence types + for try_type, typed_indexes in ((tuple, indexes), (list, list(indexes))): + with self.subTest(try_type=try_type, indexes=indexes): + self.assertEqual( + regex_toolkit.mask_span(self.text, typed_indexes, "not "), + "This is not an example", + ) + + def test_replace_word(self): indexes = (5, 7) - for try_type, indexes_as_try_type in { - tuple: indexes, - list: list(indexes), - }.items(): - actual_text = regex_toolkit.mask_span(text, indexes_as_try_type, "isn't") - expected_text = "This isn't an example" - self.assertEqual(actual_text, expected_text) - - def test_char_as_exp(self): + for try_type, typed_indexes in ((tuple, indexes), (list, list(indexes))): + with self.subTest(try_type=try_type, indexes=indexes): + self.assertEqual( + regex_toolkit.mask_span(self.text, typed_indexes, "isn't"), + "This isn't an example", + ) + + +class TestEscapeRE(unittest.TestCase): + def test_only_safe(self): + for char in SAFE_CHARS: + with self.subTest(char=char): + self.assertEqual(regex_toolkit.escape(char, RegexFlavor.RE), char) + + def test_only_escapable_chars(self): + for 
char in ESCAPE_CHARS: + with self.subTest(char=char): + char_exp = regex_toolkit.escape(char, RegexFlavor.RE) + self.assertEqual(char_exp, f"\\{char}") + # Compile and test the expression. + char_regex = re.compile(char_exp) + self.assertTrue(char_regex.match(char)) + + def test_only_unknown_chars(self): + # TODO: Include additional characters to test. + for char in "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…": + with self.subTest(char=char): + expression = regex_toolkit.escape(char, RegexFlavor.RE) + self.assertEqual(expression, f"\\{char}") + # Compile and match the expression. + regex = re.compile(r"^" + expression + r"$") + self.assertTrue(regex.match(char)) + + +class TestEscapeRE2(unittest.TestCase): + def test_only_safe(self): + for char in SAFE_CHARS: + with self.subTest(char=char): + self.assertEqual(regex_toolkit.escape(char, RegexFlavor.RE), char) + + def test_only_escapable_chars(self): + for char in ESCAPE_CHARS: + with self.subTest(char=char): + self.assertEqual( + regex_toolkit.escape(char, RegexFlavor.RE2), + f"\\{char}", + ) + + def test_only_unknown_chars(self): + # TODO: Include additional characters to test. 
for char, expected_exp in ( - ("s", "s"), - ("d", "d"), - ("\n", "\n"), - (".", "\\."), - ("!", "\\!"), - ("?", "\\?"), - ("πŸ…°", "\\πŸ…°"), - ): - actual_exp = regex_toolkit.char_as_exp(char) - self.assertEqual(actual_exp, expected_exp) - - def test_char_as_exp2(self): - for char, expected_exp in ( - ("s", "s"), - ("d", "d"), - ("\n", "\n"), - (".", "\\."), - ("!", "\\!"), - ("?", "\\?"), + # Length 1 ("πŸ…°", r"\x{0001f170}"), + ("πŸ…±", r"\x{0001f171}"), + ("πŸ…Ύ", r"\x{0001f17e}"), + ("πŸ…Ώ", r"\x{0001f17f}"), + ("πŸ†Ž", r"\x{0001f18e}"), + ("πŸ†‘", r"\x{0001f191}"), + ("πŸ†’", r"\x{0001f192}"), + ("πŸ†“", r"\x{0001f193}"), + ("πŸ†”", r"\x{0001f194}"), + ("πŸ†•", r"\x{0001f195}"), + ("πŸ†–", r"\x{0001f196}"), + ("πŸ†—", r"\x{0001f197}"), + ("πŸ†˜", r"\x{0001f198}"), + ("πŸ†™", r"\x{0001f199}"), + ("πŸ†š", r"\x{0001f19a}"), + ("πŸ‡¦", r"\x{0001f1e6}"), + ("πŸ‡§", r"\x{0001f1e7}"), + ("πŸ‡¨", r"\x{0001f1e8}"), + ("🈁", r"\x{0001f201}"), + ("πŸˆ‚", r"\x{0001f202}"), + ("🈚", r"\x{0001f21a}"), + ("🈯", r"\x{0001f22f}"), + ("🈲", r"\x{0001f232}"), + ("🈳", r"\x{0001f233}"), + ("🈴", r"\x{0001f234}"), + ("🈡", r"\x{0001f235}"), + ("🈢", r"\x{0001f236}"), + ("🈷", r"\x{0001f237}"), + ("🈸", r"\x{0001f238}"), + ("🈹", r"\x{0001f239}"), + ("🈺", r"\x{0001f23a}"), + ("πŸ‰", r"\x{0001f250}"), + ("πŸ‰‘", r"\x{0001f251}"), + ("πŸŒ€", r"\x{0001f300}"), + ("🌁", r"\x{0001f301}"), + ("πŸŒ‚", r"\x{0001f302}"), + ("πŸŒƒ", r"\x{0001f303}"), + ("πŸŒ„", r"\x{0001f304}"), + # Length 2 + ("πŸŒ…", r"\x{0001f305}"), ): - actual_exp = regex_toolkit.char_as_exp2(char) - self.assertEqual(actual_exp, expected_exp) - - def test_strings_as_exp(self): - # Alphanumeric single char and multi-char combos - for i in range(4): - for char_tuple in product(i * ["a", "b", "0", "1"]): - actual_exp = regex_toolkit.strings_as_exp(char_tuple) - expected_exp = "|".join(char_tuple) - self.assertEqual(actual_exp, expected_exp) - - # Exact matches that equate to reserved spaces - # E.g. 
Should match '\\' + 'd', not r'\d' - for text in {r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"}: - actual_exp = regex_toolkit.strings_as_exp([text]) - expected_exp = f"\\{text}" - self.assertEqual(actual_exp, expected_exp) - - # Single whitespace char - for text in {"\n", "\v", "\t", "\r", "\f", "\v"}: - actual_exp = regex_toolkit.strings_as_exp([text]) - expected_exp = text - self.assertEqual(actual_exp, expected_exp) - - # Single unsafe char - for texts, expected_exp in [ - (["."], "\\."), - (["!"], "\\!"), - (["?"], "\\?"), - ]: - actual_exp = regex_toolkit.strings_as_exp(texts) - self.assertEqual(actual_exp, expected_exp) - - # Multiple unsafe char - texts = [".", "!", "?"] - expected_exp = "\\.|\\!|\\?" - actual_exp = regex_toolkit.strings_as_exp(texts) - self.assertEqual(actual_exp, expected_exp) - - for texts, expected_exp in [ - (["πŸ…°"], "\\πŸ…°"), - (["πŸ…°", "πŸ…±"], "\\πŸ…°|\\πŸ…±"), - (["alpha", "beta"], "alpha|beta"), - (["πŸ…°lpha", "πŸ…±eta"], "\\πŸ…°lpha|\\πŸ…±eta"), - (["πŸ…°lpha", "Beta"], "\\πŸ…°lpha|Beta"), - ]: - actual_exp = regex_toolkit.strings_as_exp(texts) - self.assertEqual(actual_exp, expected_exp) - - def test_strings_as_exp2(self): - # Alphanumeric single char and multi-char combos - for i in range(4): - for char_tuple in product(i * ["a", "b", "0", "1"]): - actual_exp = regex_toolkit.strings_as_exp2(char_tuple) - expected_exp = "|".join(char_tuple) - self.assertEqual(actual_exp, expected_exp) - - # Exact matches that equate to reserved spaces - # E.g. 
Should match '\\' + 'd', not r'\d' - for text in {r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"}: - actual_exp = regex_toolkit.strings_as_exp2([text]) - expected_exp = f"\\{text}" - self.assertEqual(actual_exp, expected_exp) - - # Single whitespace char - for text in {"\n", "\v", "\t", "\r", "\f", "\v"}: - actual_exp = regex_toolkit.strings_as_exp2([text]) - expected_exp = text - self.assertEqual(actual_exp, expected_exp) - - # Single unsafe char - for texts, expected_exp in [ - (["."], "\\."), - (["!"], "\\!"), - (["?"], "\\?"), - ]: - actual_exp = regex_toolkit.strings_as_exp2(texts) - self.assertEqual(actual_exp, expected_exp) - - # Multiple unsafe char - texts = [".", "!", "?"] - expected_exp = "\\.|\\!|\\?" - actual_exp = regex_toolkit.strings_as_exp2(texts) - self.assertEqual(actual_exp, expected_exp) - - for texts, expected_exp in [ - (["πŸ…°"], "\\x{0001f170}"), - (["πŸ…°", "πŸ…±"], "\\x{0001f170}|\\x{0001f171}"), - (["alpha", "beta"], "alpha|beta"), - (["πŸ…°lpha", "πŸ…±eta"], "\\x{0001f170}lpha|\\x{0001f171}eta"), - (["πŸ…°lpha", "Beta"], "\\x{0001f170}lpha|Beta"), - ]: - actual_exp = regex_toolkit.strings_as_exp2(texts) - self.assertEqual(actual_exp, expected_exp) + with self.subTest(char=char, expected_exp=expected_exp): + self.assertEqual( + regex_toolkit.escape(char, RegexFlavor.RE2), + expected_exp, + ) + + +class TestStringAsExpressionRE(unittest.TestCase): + # TODO: Add tests for mix of characters. 
+ def test_safe(self): + text = "".join(SAFE_CHARS) + self.assertEqual(regex_toolkit.string_as_exp(text, RegexFlavor.RE), text) + + def test_escapable(self): + text = "".join(ESCAPE_CHARS) + self.assertEqual( + regex_toolkit.string_as_exp(text, RegexFlavor.RE), + "".join(f"\\{char}" for char in ESCAPE_CHARS), + ) + + def test_unknown(self): + text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + self.assertEqual( + regex_toolkit.string_as_exp(text, RegexFlavor.RE), + "".join(f"\\{char}" for char in text), + ) + + +class TestStringAsExpressionRE2(unittest.TestCase): + # TODO: Add tests for mix of characters. + def test_only_safe(self): + text = "".join(SAFE_CHARS) + self.assertEqual( + regex_toolkit.string_as_exp(text, RegexFlavor.RE2), + "".join(SAFE_CHARS), + ) + + def test_only_escapable_chars(self): + text = "".join(ESCAPE_CHARS) + self.assertEqual( + regex_toolkit.string_as_exp(text, RegexFlavor.RE2), + "".join(f"\\{char}" for char in ESCAPE_CHARS), + ) + + def test_only_unknown_chars(self): + text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + self.assertEqual( + regex_toolkit.string_as_exp(text, RegexFlavor.RE2), + r"".join( + ( + r"\x{0001f170}", + r"\x{0001f171}", + r"\x{0001f17e}", + r"\x{0001f17f}", + r"\x{0001f18e}", + r"\x{0001f191}", + r"\x{0001f192}", + r"\x{0001f193}", + r"\x{0001f194}", + r"\x{0001f195}", + r"\x{0001f196}", + r"\x{0001f197}", + r"\x{0001f198}", + r"\x{0001f199}", + r"\x{0001f19a}", + r"\x{0001f1e6}", + r"\x{0001f1e7}", + r"\x{0001f1e8}", + r"\x{0001f201}", + r"\x{0001f202}", + r"\x{0001f21a}", + r"\x{0001f22f}", + r"\x{0001f232}", + r"\x{0001f233}", + r"\x{0001f234}", + r"\x{0001f235}", + r"\x{0001f236}", + r"\x{0001f237}", + r"\x{0001f238}", + r"\x{0001f239}", + r"\x{0001f23a}", + r"\x{0001f250}", + 
r"\x{0001f251}", + r"\x{0001f300}", + r"\x{0001f301}", + r"\x{0001f302}", + r"\x{0001f303}", + r"\x{0001f304}", + # Length 2 + r"\x{0001f305}", + ) + ), + ) + + +class StringsAsExpressionRE(unittest.TestCase): + def test_only_safe(self): + # Unique combinations of SAFE_CHARS using various lengths (1-4). + # elements = tuple(SAFE_CHARS) + elements = SAFE_CHARS + for i in range(1, 5): + for texts in combinations_with_replacement(elements, i): + with self.subTest(texts=texts): + self.assertEqual( + regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), + "|".join(texts), + ) + + def test_only_escapable_chars(self): + # Unique combinations of ESCAPE_CHARS using various lengths (1-4). + # elements = tuple(ESCAPE_CHARS) + elements = ESCAPE_CHARS + for i in range(1, 5): + for texts in combinations_with_replacement(elements, i): + with self.subTest(texts=texts): + self.assertEqual( + regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), + "|".join(f"\\{text}" for text in texts), + ) + + def test_reserved_only(self): + # Unique combinations of reserved expressions using various lengths (1-4). + elements = ( + r"\A", + r"\b", + r"\B", + r"\d", + r"\D", + r"\s", + r"\S", + r"\w", + r"\W", + r"\Z", + r"\1", + ) + for i in range(1, 5): + for texts in combinations_with_replacement(elements, i): + with self.subTest(texts=texts): + self.assertEqual( + regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), + "|".join(f"\\{text}" for text in texts), + ) + + def test_unsafe_only(self): + # TODO: Include text/chars such as punctuation, etc. + # Unique combinations of UNSAFE_CHARS using various lengths (1-4). 
+ # elements = tuple(UNSAFE_CHARS) + elements = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + for i in range(1, 5): + for texts in combinations_with_replacement(elements, i): + with self.subTest(texts=texts): + self.assertEqual( + regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), + "|".join(f"\\{text}" for text in texts), + ) + + +###################### +###################### # Multiple unsafe char +###################### self.assertEqual( +###################### regex_toolkit.strings_as_exp([".", "!", "?"], RegexFlavor.RE), +###################### "\\.|\\!|\\?", +###################### ) +###################### +###################### for texts, expected_exp in [ +###################### (["πŸ…°"], "\\πŸ…°"), +###################### (["πŸ…°", "πŸ…±"], "\\πŸ…°|\\πŸ…±"), +###################### (["alpha", "beta"], "alpha|beta"), +###################### (["πŸ…°lpha", "πŸ…±eta"], "\\πŸ…°lpha|\\πŸ…±eta"), +###################### (["πŸ…°lpha", "Beta"], "\\πŸ…°lpha|Beta"), +###################### ]: +###################### self.assertEqual( +###################### regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), +###################### expected_exp, +###################### ) + + +class StringsAsExpressionRE2(unittest.TestCase): + def test_only_safe(self): + # Unique combinations of SAFE_CHARS using various lengths (1-4). + # elements = tuple(SAFE_CHARS) + elements = SAFE_CHARS + for i in range(1, 5): + for texts in combinations_with_replacement(elements, i): + with self.subTest(texts=texts): + self.assertEqual( + regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), + "|".join(texts), + ) + + def test_only_escapable_chars(self): + # Unique combinations of ESCAPE_CHARS using various lengths (1-4). 
+ # elements = tuple(ESCAPE_CHARS) + elements = ESCAPE_CHARS + for i in range(1, 5): + for texts in combinations_with_replacement(elements, i): + with self.subTest(texts=texts): + self.assertEqual( + regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), + "|".join(f"\\{text}" for text in texts), + ) + + def test_reserved_only(self): + # Unique combinations of reserved expressions using various lengths (1-4). + elements = ( + r"\A", + r"\b", + r"\B", + r"\d", + r"\D", + r"\s", + r"\S", + r"\w", + r"\W", + r"\Z", + r"\1", + ) + for i in range(1, 5): + for texts in combinations_with_replacement(elements, i): + with self.subTest(texts=texts): + self.assertEqual( + regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), + "|".join(f"\\{text}" for text in texts), + ) + + def test_unsafe_only(self): + # TODO: Include text/chars such as punctuation, etc. + # Unique combinations of UNSAFE_CHARS using various lengths (1-4). + # elements = tuple(UNSAFE_CHARS) + elements_map = { + # Length 1 + "πŸ…°": r"\x{0001f170}", + "πŸ…±": r"\x{0001f171}", + "πŸ…Ύ": r"\x{0001f17e}", + "πŸ…Ώ": r"\x{0001f17f}", + "πŸ†Ž": r"\x{0001f18e}", + "πŸ†‘": r"\x{0001f191}", + "πŸ†’": r"\x{0001f192}", + "πŸ†“": r"\x{0001f193}", + "πŸ†”": r"\x{0001f194}", + "πŸ†•": r"\x{0001f195}", + "πŸ†–": r"\x{0001f196}", + "πŸ†—": r"\x{0001f197}", + "πŸ†˜": r"\x{0001f198}", + "πŸ†™": r"\x{0001f199}", + "πŸ†š": r"\x{0001f19a}", + "πŸ‡¦": r"\x{0001f1e6}", + "πŸ‡§": r"\x{0001f1e7}", + "πŸ‡¨": r"\x{0001f1e8}", + "🈁": r"\x{0001f201}", + "πŸˆ‚": r"\x{0001f202}", + "🈚": r"\x{0001f21a}", + "🈯": r"\x{0001f22f}", + "🈲": r"\x{0001f232}", + "🈳": r"\x{0001f233}", + "🈴": r"\x{0001f234}", + "🈡": r"\x{0001f235}", + "🈢": r"\x{0001f236}", + "🈷": r"\x{0001f237}", + "🈸": r"\x{0001f238}", + "🈹": r"\x{0001f239}", + "🈺": r"\x{0001f23a}", + "πŸ‰": r"\x{0001f250}", + "πŸ‰‘": r"\x{0001f251}", + "πŸŒ€": r"\x{0001f300}", + "🌁": r"\x{0001f301}", + "πŸŒ‚": r"\x{0001f302}", + "πŸŒƒ": r"\x{0001f303}", + "πŸŒ„": r"\x{0001f304}", + # Length 2 + 
"πŸŒ…": r"\x{0001f305}", + } + elements = tuple(elements_map) + for i in range(1, 5): + for texts in combinations_with_replacement(elements, i): + with self.subTest(texts=texts): + self.assertEqual( + regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), + "|".join(elements_map[text] for text in texts), + ) + + +##############################3 # Exact matches that equate to reserved spaces +##############################3 # E.g. Should match '\\' + 'd', not r'\d' +##############################3 for text in (r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"): +##############################3 texts = [text] +##############################3 with self.subTest(texts=texts): +##############################3 self.assertEqual( +##############################3 regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), +##############################3 f"\\{text}", +##############################3 ) +##############################3 +##############################3 # Single whitespace char +##############################3 for texts in (["\n"], ["\v"], ["\t"], ["\r"], ["\f"], ["\v"]): +##############################3 with self.subTest(texts=texts): +##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), texts[0]) +##############################3 +##############################3 # Single unsafe char +##############################3 for texts, expected_exp in [ +##############################3 (["."], "\\."), +##############################3 (["!"], "\\!"), +##############################3 (["?"], "\\?"), +##############################3 ]: +##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): +##############################3 self.assertEqual( +##############################3 regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), +##############################3 expected_exp, +##############################3 ) +##############################3 +##############################3 # Multiple unsafe char 
+##############################3 texts = [".", "!", "?"] +##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), "\\.|\\!|\\?") +##############################3 +##############################3 for texts, expected_exp in [ +##############################3 (["πŸ…°"], "\\x{0001f170}"), +##############################3 (["πŸ…°", "πŸ…±"], "\\x{0001f170}|\\x{0001f171}"), +##############################3 (["alpha", "beta"], "alpha|beta"), +##############################3 (["πŸ…°lpha", "πŸ…±eta"], "\\x{0001f170}lpha|\\x{0001f171}eta"), +##############################3 (["πŸ…°lpha", "Beta"], "\\x{0001f170}lpha|Beta"), +##############################3 ]: +##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): +##############################3 self.assertEqual( +##############################3 regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), +##############################3 expected_exp, +##############################3 ) + +# TODO: Add tests for actually compiling the e. From 71414871d6cc94c26e01cdd727551c0d8ff49119 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sun, 12 Mar 2023 16:47:44 -0500 Subject: [PATCH 4/7] Move utils tests and base tests to own files and improved both. Better names for constant variables. 
--- src/regex_toolkit/base.py | 8 +- src/regex_toolkit/constants.py | 15 +- tests/.gitignore | 3 +- tests/test_base.py | 489 ++++++++++++++++++++++++++++++ tests/test_functions.py | 527 --------------------------------- tests/test_utils.py | 119 ++++++++ 6 files changed, 623 insertions(+), 538 deletions(-) create mode 100644 tests/test_base.py delete mode 100644 tests/test_functions.py create mode 100644 tests/test_utils.py diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index 7c17b18..a8e0508 100644 --- a/src/regex_toolkit/base.py +++ b/src/regex_toolkit/base.py @@ -5,7 +5,7 @@ ] from collections.abc import Iterable -from regex_toolkit.constants import ESCAPE_CHARS, SAFE_CHARS +from regex_toolkit.constants import ALWAYS_ESCAPED, ALWAYS_SAFE from regex_toolkit.enums import RegexFlavor from regex_toolkit.utils import char_to_cpoint, iter_sort_by_len @@ -36,7 +36,7 @@ def escape(char: str, flavor: int = 1) -> str: def _escape(char: str) -> str: - if char in SAFE_CHARS: + if char in ALWAYS_SAFE: # Safe as-is return char else: @@ -45,10 +45,10 @@ def _escape(char: str) -> str: def _escape2(char: str) -> str: - if char in SAFE_CHARS: + if char in ALWAYS_SAFE: # Safe as-is return char - elif char in ESCAPE_CHARS: + elif char in ALWAYS_ESCAPED: # Safe to escape with backslash return f"\\{char}" else: diff --git a/src/regex_toolkit/constants.py b/src/regex_toolkit/constants.py index e67ec4b..74e42b0 100644 --- a/src/regex_toolkit/constants.py +++ b/src/regex_toolkit/constants.py @@ -6,13 +6,16 @@ from typing import Final __all__ = [ - "SAFE_CHARS", - "ESCAPE_CHARS", + "ALWAYS_SAFE", + "ALWAYS_ESCAPED", + "ASCIILETTERS", + "DIGITS", ] - -SAFE_CHARS: Final[frozenset[str]] = frozenset( - map(chr, b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") +DIGITS: Final[frozenset[str]] = frozenset("0123456789") +ASCIILETTERS: Final[frozenset[str]] = frozenset( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" ) -ESCAPE_CHARS: 
Final[frozenset[str]] = frozenset( +ALWAYS_SAFE: Final[frozenset[str]] = frozenset() | DIGITS | ASCIILETTERS +ALWAYS_ESCAPED: Final[frozenset[str]] = frozenset( map(chr, b"()[]{}?*+-|^$\\.&~# \t\n\r\v\f") ) diff --git a/tests/.gitignore b/tests/.gitignore index be87017..98d7e95 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -2,6 +2,7 @@ * # Except -!test_functions.py +!test_base.py +!test_utils.py !.gitignore diff --git a/tests/test_base.py b/tests/test_base.py new file mode 100644 index 0000000..87af87c --- /dev/null +++ b/tests/test_base.py @@ -0,0 +1,489 @@ +import re +import unittest +from itertools import product + +import re2 + +import regex_toolkit +from regex_toolkit.constants import ALWAYS_ESCAPED, ALWAYS_SAFE +from regex_toolkit.enums import RegexFlavor + + +class TestEscapeRE(unittest.TestCase): + def setUp(self): + self._flavor = RegexFlavor.RE + self._re_compile = re.compile + + def test_safe_chars(self): + for char in ALWAYS_SAFE: + with self.subTest(char=char): + expected_exp = char + actual_exp = regex_toolkit.escape(char, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the character. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(char)) + + def test_escapable_chars(self): + for char in ALWAYS_ESCAPED: + with self.subTest(char=char): + expected_exp = f"\\{char}" + actual_exp = regex_toolkit.escape(char, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the character. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(char)) + + def test_unknown_chars(self): + # TODO: Include additional characters to test. 
+ for char in "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…": + with self.subTest(char=char): + expected_exp = f"\\{char}" + actual_exp = regex_toolkit.escape(char, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the character. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(char)) + + +class TestEscapeRE2(unittest.TestCase): + def setUp(self): + self._flavor = RegexFlavor.RE2 + self._re_compile = re2.compile + + def test_safe_chars(self): + for char in ALWAYS_SAFE: + with self.subTest(char=char): + expected_exp = char + actual_exp = regex_toolkit.escape(char, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the character. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(char)) + + def test_escapable_chars(self): + for char in ALWAYS_ESCAPED: + with self.subTest(char=char): + expected_exp = f"\\{char}" + actual_exp = regex_toolkit.escape(char, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the character. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(char)) + + def test_unknown_chars(self): + # TODO: Include additional characters to test. 
+ for char, expected_exp in ( + # Length 1 + ("πŸ…°", r"\x{0001f170}"), + ("πŸ…±", r"\x{0001f171}"), + ("πŸ…Ύ", r"\x{0001f17e}"), + ("πŸ…Ώ", r"\x{0001f17f}"), + ("πŸ†Ž", r"\x{0001f18e}"), + ("πŸ†‘", r"\x{0001f191}"), + ("πŸ†’", r"\x{0001f192}"), + ("πŸ†“", r"\x{0001f193}"), + ("πŸ†”", r"\x{0001f194}"), + ("πŸ†•", r"\x{0001f195}"), + ("πŸ†–", r"\x{0001f196}"), + ("πŸ†—", r"\x{0001f197}"), + ("πŸ†˜", r"\x{0001f198}"), + ("πŸ†™", r"\x{0001f199}"), + ("πŸ†š", r"\x{0001f19a}"), + ("πŸ‡¦", r"\x{0001f1e6}"), + ("πŸ‡§", r"\x{0001f1e7}"), + ("πŸ‡¨", r"\x{0001f1e8}"), + ("🈁", r"\x{0001f201}"), + ("πŸˆ‚", r"\x{0001f202}"), + ("🈚", r"\x{0001f21a}"), + ("🈯", r"\x{0001f22f}"), + ("🈲", r"\x{0001f232}"), + ("🈳", r"\x{0001f233}"), + ("🈴", r"\x{0001f234}"), + ("🈡", r"\x{0001f235}"), + ("🈢", r"\x{0001f236}"), + ("🈷", r"\x{0001f237}"), + ("🈸", r"\x{0001f238}"), + ("🈹", r"\x{0001f239}"), + ("🈺", r"\x{0001f23a}"), + ("πŸ‰", r"\x{0001f250}"), + ("πŸ‰‘", r"\x{0001f251}"), + ("πŸŒ€", r"\x{0001f300}"), + ("🌁", r"\x{0001f301}"), + ("πŸŒ‚", r"\x{0001f302}"), + ("πŸŒƒ", r"\x{0001f303}"), + ("πŸŒ„", r"\x{0001f304}"), + # Length 2 + ("πŸŒ…", r"\x{0001f305}"), + ): + with self.subTest(char=char): + actual_exp = regex_toolkit.escape(char, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the character. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(char)) + + +class TestStringAsExpressionRE(unittest.TestCase): + def setUp(self): + self._flavor = RegexFlavor.RE + self._re_compile = re.compile + + # TODO: Add tests for mix of characters. + def test_safe_chars(self): + text = "".join(ALWAYS_SAFE) + expected_exp = text + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. 
+ pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_escapable_chars(self): + text = "".join(ALWAYS_ESCAPED) + expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPED) + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_unknown_chars(self): + text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + expected_exp = "".join(f"\\{char}" for char in text) + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + +class TestStringAsExpressionRE2(unittest.TestCase): + def setUp(self): + self._flavor = RegexFlavor.RE2 + self._re_compile = re2.compile + + # TODO: Add tests for mix of characters. + def test_safe_chars(self): + text = "".join(ALWAYS_SAFE) + expected_exp = "".join(ALWAYS_SAFE) + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_escapable_chars(self): + text = "".join(ALWAYS_ESCAPED) + expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPED) + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. 
+ pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_unknown_chars(self): + text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + expected_exp = r"".join( + ( + r"\x{0001f170}", + r"\x{0001f171}", + r"\x{0001f17e}", + r"\x{0001f17f}", + r"\x{0001f18e}", + r"\x{0001f191}", + r"\x{0001f192}", + r"\x{0001f193}", + r"\x{0001f194}", + r"\x{0001f195}", + r"\x{0001f196}", + r"\x{0001f197}", + r"\x{0001f198}", + r"\x{0001f199}", + r"\x{0001f19a}", + r"\x{0001f1e6}", + r"\x{0001f1e7}", + r"\x{0001f1e8}", + r"\x{0001f201}", + r"\x{0001f202}", + r"\x{0001f21a}", + r"\x{0001f22f}", + r"\x{0001f232}", + r"\x{0001f233}", + r"\x{0001f234}", + r"\x{0001f235}", + r"\x{0001f236}", + r"\x{0001f237}", + r"\x{0001f238}", + r"\x{0001f239}", + r"\x{0001f23a}", + r"\x{0001f250}", + r"\x{0001f251}", + r"\x{0001f300}", + r"\x{0001f301}", + r"\x{0001f302}", + r"\x{0001f303}", + r"\x{0001f304}", + # Length 2 + r"\x{0001f305}", + ) + ) + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + +class StringsAsExpressionRE(unittest.TestCase): + def setUp(self): + self._flavor = RegexFlavor.RE + self._re_compile = re.compile + self._max_combo_length = 2 + + def test_safe_of_variable_length(self): + # Unique combinations of `ALWAYS_SAFE` using various lengths. + elements = set(ALWAYS_SAFE) + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. 
+ pattern = self._re_compile(actual_exp) + for text in texts: + self.assertTrue(pattern.match(text)) + + def test_escapable_of_variable_length(self): + # Unique combinations of `ALWAYS_ESCAPED` using various lengths. + elements = set(ALWAYS_ESCAPED) + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(f"\\{text}" for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + self.assertTrue(pattern.match(text)) + + def test_reserved_of_variable_length(self): + # Unique combinations of reserved expressions using various lengths. + elements = ( + r"\A", + r"\b", + r"\B", + r"\d", + r"\D", + r"\s", + r"\S", + r"\w", + r"\W", + r"\Z", + r"\1", + ) + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(f"\\{text}" for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + self.assertTrue(pattern.match(text)) + + def test_unsafe_of_variable_length(self): + # TODO: Include text/chars such as punctuation, etc. + # Unique combinations of `ALWAYS_SAFE` using various lengths. 
+ elements = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(f"\\{text}" for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + self.assertTrue(pattern.match(text)) + + +###################### +###################### # Multiple unsafe char +###################### self.assertEqual( +###################### regex_toolkit.strings_as_exp([".", "!", "?"], self._flavor), +###################### "\\.|\\!|\\?", +###################### ) +###################### +###################### for texts, expected_exp in [ +###################### (["πŸ…°"], "\\πŸ…°"), +###################### (["πŸ…°", "πŸ…±"], "\\πŸ…°|\\πŸ…±"), +###################### (["alpha", "beta"], "alpha|beta"), +###################### (["πŸ…°lpha", "πŸ…±eta"], "\\πŸ…°lpha|\\πŸ…±eta"), +###################### (["πŸ…°lpha", "Beta"], "\\πŸ…°lpha|Beta"), +###################### ]: +###################### self.assertEqual( +###################### regex_toolkit.strings_as_exp(texts, self._flavor), +###################### expected_exp, +###################### ) + + +class StringsAsExpressionRE2(unittest.TestCase): + def setUp(self): + self._flavor = RegexFlavor.RE2 + self._re_compile = re2.compile + self._max_combo_length = 2 + + def test_safe_of_variable_length(self): + # Unique combinations of ALWAYS_SAFE using various lengths. 
+ elements = set(ALWAYS_SAFE) + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + self.assertTrue(pattern.match(text)) + + def test_escapable_of_variable_length(self): + # Unique combinations of ALWAYS_ESCAPED using various lengths. + elements = set(ALWAYS_ESCAPED) + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(f"\\{text}" for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + self.assertTrue(pattern.match(text)) + + def test_reserved_of_variable_length(self): + # Unique combinations of reserved expressions using various lengths. + elements = ( + r"\A", + r"\b", + r"\B", + r"\d", + r"\D", + r"\s", + r"\S", + r"\w", + r"\W", + r"\Z", + r"\1", + ) + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(f"\\{text}" for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + self.assertTrue(pattern.match(text)) + + def test_unsafe_of_variable_length(self): + # TODO: Include text/chars such as punctuation, etc. + # Unique combinations of ALWAYS_SAFE using various lengths. 
+ elements_map = { + # Length 1 + "πŸ…°": r"\x{0001f170}", + "πŸ…±": r"\x{0001f171}", + "πŸ…Ύ": r"\x{0001f17e}", + "πŸ…Ώ": r"\x{0001f17f}", + "πŸ†Ž": r"\x{0001f18e}", + "πŸ†‘": r"\x{0001f191}", + "πŸ†’": r"\x{0001f192}", + "πŸ†“": r"\x{0001f193}", + "πŸ†”": r"\x{0001f194}", + "πŸ†•": r"\x{0001f195}", + "πŸ†–": r"\x{0001f196}", + "πŸ†—": r"\x{0001f197}", + "πŸ†˜": r"\x{0001f198}", + "πŸ†™": r"\x{0001f199}", + "πŸ†š": r"\x{0001f19a}", + "πŸ‡¦": r"\x{0001f1e6}", + "πŸ‡§": r"\x{0001f1e7}", + "πŸ‡¨": r"\x{0001f1e8}", + "🈁": r"\x{0001f201}", + "πŸˆ‚": r"\x{0001f202}", + "🈚": r"\x{0001f21a}", + "🈯": r"\x{0001f22f}", + "🈲": r"\x{0001f232}", + "🈳": r"\x{0001f233}", + "🈴": r"\x{0001f234}", + "🈡": r"\x{0001f235}", + "🈢": r"\x{0001f236}", + "🈷": r"\x{0001f237}", + "🈸": r"\x{0001f238}", + "🈹": r"\x{0001f239}", + "🈺": r"\x{0001f23a}", + "πŸ‰": r"\x{0001f250}", + "πŸ‰‘": r"\x{0001f251}", + "πŸŒ€": r"\x{0001f300}", + "🌁": r"\x{0001f301}", + "πŸŒ‚": r"\x{0001f302}", + "πŸŒƒ": r"\x{0001f303}", + "πŸŒ„": r"\x{0001f304}", + # Length 2 + "πŸŒ…": r"\x{0001f305}", + } + elements = tuple(elements_map) + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join(elements_map[text] for text in texts) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. + pattern = self._re_compile(actual_exp) + for text in texts: + self.assertTrue(pattern.match(text)) + + +##############################3 # Exact matches that equate to reserved spaces +##############################3 # E.g. 
Should match '\\' + 'd', not r'\d' +##############################3 for text in (r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"): +##############################3 texts = [text] +##############################3 with self.subTest(texts=texts): +##############################3 self.assertEqual( +##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), +##############################3 f"\\{text}", +##############################3 ) +##############################3 +##############################3 # Single whitespace char +##############################3 for texts in (["\n"], ["\v"], ["\t"], ["\r"], ["\f"], ["\v"]): +##############################3 with self.subTest(texts=texts): +##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, self._flavor), texts[0]) +##############################3 +##############################3 # Single unsafe char +##############################3 for texts, expected_exp in [ +##############################3 (["."], "\\."), +##############################3 (["!"], "\\!"), +##############################3 (["?"], "\\?"), +##############################3 ]: +##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): +##############################3 self.assertEqual( +##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), +##############################3 expected_exp, +##############################3 ) +##############################3 +##############################3 # Multiple unsafe char +##############################3 texts = [".", "!", "?"] +##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, self._flavor), "\\.|\\!|\\?") +##############################3 +##############################3 for texts, expected_exp in [ +##############################3 (["πŸ…°"], "\\x{0001f170}"), +##############################3 (["πŸ…°", "πŸ…±"], "\\x{0001f170}|\\x{0001f171}"), +##############################3 
(["alpha", "beta"], "alpha|beta"), +##############################3 (["πŸ…°lpha", "πŸ…±eta"], "\\x{0001f170}lpha|\\x{0001f171}eta"), +##############################3 (["πŸ…°lpha", "Beta"], "\\x{0001f170}lpha|Beta"), +##############################3 ]: +##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): +##############################3 self.assertEqual( +##############################3 regex_toolkit.strings_as_exp(texts, self._flavor), +##############################3 expected_exp, +##############################3 ) + +# TODO: Add tests for actually compiling the e. diff --git a/tests/test_functions.py b/tests/test_functions.py deleted file mode 100644 index 7862ae1..0000000 --- a/tests/test_functions.py +++ /dev/null @@ -1,527 +0,0 @@ -import re -import unittest -from collections.abc import Iterable # Generator -from itertools import combinations_with_replacement # product - -import regex_toolkit -from regex_toolkit.constants import ESCAPE_CHARS, SAFE_CHARS -from regex_toolkit.enums import RegexFlavor - - -def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool: - prev_len = None - for text in texts: - if prev_len is None: - prev_len = len(text) - if reverse: - if len(text) > prev_len: - return False - else: - if len(text) < prev_len: - return False - prev_len = len(text) - return True - - -class TestRegexFlavor(unittest.TestCase): - def test_flavor(self): - for flavor in RegexFlavor: - with self.subTest(flavor=flavor): - self.assertIsInstance(flavor, RegexFlavor) - self.assertIsInstance(flavor.name, str) - self.assertIsInstance(flavor.value, int) - - -class TestUtils(unittest.TestCase): - def setUp(self) -> None: - self.texts = { - "apple", - "orange", - "banana", - "grape", - "apricot", - "cherry", - "plum", - "blueberry", - "strawberry", - "blackberry", - } - self.texts_by_type = ( - (set, self.texts), - (frozenset, frozenset(self.texts)), - (tuple, tuple(self.texts)), - (list, list(self.texts)), - (dict, 
dict.fromkeys(self.texts, None)), - ) - - def test_iter_sort_by_len(self): - for try_type, typed_texts in self.texts_by_type: - for reverse in (False, True): - with self.subTest( - try_type=try_type, typed_texts=typed_texts, reverse=reverse - ): - result = regex_toolkit.iter_sort_by_len( - typed_texts, reverse=reverse - ) - self.assertIsInstance(result, Iterable) - result_tuple = tuple(result) - self.assertTrue(is_sorted_by_len(result_tuple, reverse=reverse)) - self.assertEqual(len(result_tuple), len(typed_texts)) - - def test_sort_by_len(self): - for try_type, typed_texts in self.texts_by_type: - for reverse in (False, True): - with self.subTest( - try_type=try_type, typed_texts=typed_texts, reverse=reverse - ): - result = regex_toolkit.sort_by_len(typed_texts, reverse=reverse) - self.assertIsInstance(result, tuple) - self.assertTrue(is_sorted_by_len(result, reverse=reverse)) - self.assertEqual(len(result), len(typed_texts)) - - def test_iter_char_range(self): - result = regex_toolkit.iter_char_range("a", "z") - self.assertIsInstance(result, Iterable) - self.assertTupleEqual(tuple(result), tuple("abcdefghijklmnopqrstuvwxyz")) - - def test_char_range(self): - self.assertEqual( - regex_toolkit.char_range("a", "z"), tuple("abcdefghijklmnopqrstuvwxyz") - ) - - -class TestMasking(unittest.TestCase): - def setUp(self): - self.text = "This is an example" - - def test_insert_word(self): - indexes = (8, 8) - for try_type, typed_indexes in ((tuple, indexes), (list, list(indexes))): - with self.subTest(try_type=try_type, indexes=indexes): - self.assertEqual( - regex_toolkit.mask_span(self.text, typed_indexes, "not "), - "This is not an example", - ) - - def test_replace_word(self): - indexes = (5, 7) - for try_type, typed_indexes in ((tuple, indexes), (list, list(indexes))): - with self.subTest(try_type=try_type, indexes=indexes): - self.assertEqual( - regex_toolkit.mask_span(self.text, typed_indexes, "isn't"), - "This isn't an example", - ) - - -class 
TestEscapeRE(unittest.TestCase): - def test_only_safe(self): - for char in SAFE_CHARS: - with self.subTest(char=char): - self.assertEqual(regex_toolkit.escape(char, RegexFlavor.RE), char) - - def test_only_escapable_chars(self): - for char in ESCAPE_CHARS: - with self.subTest(char=char): - char_exp = regex_toolkit.escape(char, RegexFlavor.RE) - self.assertEqual(char_exp, f"\\{char}") - # Compile and test the expression. - char_regex = re.compile(char_exp) - self.assertTrue(char_regex.match(char)) - - def test_only_unknown_chars(self): - # TODO: Include additional characters to test. - for char in "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…": - with self.subTest(char=char): - expression = regex_toolkit.escape(char, RegexFlavor.RE) - self.assertEqual(expression, f"\\{char}") - # Compile and match the expression. - regex = re.compile(r"^" + expression + r"$") - self.assertTrue(regex.match(char)) - - -class TestEscapeRE2(unittest.TestCase): - def test_only_safe(self): - for char in SAFE_CHARS: - with self.subTest(char=char): - self.assertEqual(regex_toolkit.escape(char, RegexFlavor.RE), char) - - def test_only_escapable_chars(self): - for char in ESCAPE_CHARS: - with self.subTest(char=char): - self.assertEqual( - regex_toolkit.escape(char, RegexFlavor.RE2), - f"\\{char}", - ) - - def test_only_unknown_chars(self): - # TODO: Include additional characters to test. 
- for char, expected_exp in ( - # Length 1 - ("πŸ…°", r"\x{0001f170}"), - ("πŸ…±", r"\x{0001f171}"), - ("πŸ…Ύ", r"\x{0001f17e}"), - ("πŸ…Ώ", r"\x{0001f17f}"), - ("πŸ†Ž", r"\x{0001f18e}"), - ("πŸ†‘", r"\x{0001f191}"), - ("πŸ†’", r"\x{0001f192}"), - ("πŸ†“", r"\x{0001f193}"), - ("πŸ†”", r"\x{0001f194}"), - ("πŸ†•", r"\x{0001f195}"), - ("πŸ†–", r"\x{0001f196}"), - ("πŸ†—", r"\x{0001f197}"), - ("πŸ†˜", r"\x{0001f198}"), - ("πŸ†™", r"\x{0001f199}"), - ("πŸ†š", r"\x{0001f19a}"), - ("πŸ‡¦", r"\x{0001f1e6}"), - ("πŸ‡§", r"\x{0001f1e7}"), - ("πŸ‡¨", r"\x{0001f1e8}"), - ("🈁", r"\x{0001f201}"), - ("πŸˆ‚", r"\x{0001f202}"), - ("🈚", r"\x{0001f21a}"), - ("🈯", r"\x{0001f22f}"), - ("🈲", r"\x{0001f232}"), - ("🈳", r"\x{0001f233}"), - ("🈴", r"\x{0001f234}"), - ("🈡", r"\x{0001f235}"), - ("🈢", r"\x{0001f236}"), - ("🈷", r"\x{0001f237}"), - ("🈸", r"\x{0001f238}"), - ("🈹", r"\x{0001f239}"), - ("🈺", r"\x{0001f23a}"), - ("πŸ‰", r"\x{0001f250}"), - ("πŸ‰‘", r"\x{0001f251}"), - ("πŸŒ€", r"\x{0001f300}"), - ("🌁", r"\x{0001f301}"), - ("πŸŒ‚", r"\x{0001f302}"), - ("πŸŒƒ", r"\x{0001f303}"), - ("πŸŒ„", r"\x{0001f304}"), - # Length 2 - ("πŸŒ…", r"\x{0001f305}"), - ): - with self.subTest(char=char, expected_exp=expected_exp): - self.assertEqual( - regex_toolkit.escape(char, RegexFlavor.RE2), - expected_exp, - ) - - -class TestStringAsExpressionRE(unittest.TestCase): - # TODO: Add tests for mix of characters. 
- def test_safe(self): - text = "".join(SAFE_CHARS) - self.assertEqual(regex_toolkit.string_as_exp(text, RegexFlavor.RE), text) - - def test_escapable(self): - text = "".join(ESCAPE_CHARS) - self.assertEqual( - regex_toolkit.string_as_exp(text, RegexFlavor.RE), - "".join(f"\\{char}" for char in ESCAPE_CHARS), - ) - - def test_unknown(self): - text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" - self.assertEqual( - regex_toolkit.string_as_exp(text, RegexFlavor.RE), - "".join(f"\\{char}" for char in text), - ) - - -class TestStringAsExpressionRE2(unittest.TestCase): - # TODO: Add tests for mix of characters. - def test_only_safe(self): - text = "".join(SAFE_CHARS) - self.assertEqual( - regex_toolkit.string_as_exp(text, RegexFlavor.RE2), - "".join(SAFE_CHARS), - ) - - def test_only_escapable_chars(self): - text = "".join(ESCAPE_CHARS) - self.assertEqual( - regex_toolkit.string_as_exp(text, RegexFlavor.RE2), - "".join(f"\\{char}" for char in ESCAPE_CHARS), - ) - - def test_only_unknown_chars(self): - text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" - self.assertEqual( - regex_toolkit.string_as_exp(text, RegexFlavor.RE2), - r"".join( - ( - r"\x{0001f170}", - r"\x{0001f171}", - r"\x{0001f17e}", - r"\x{0001f17f}", - r"\x{0001f18e}", - r"\x{0001f191}", - r"\x{0001f192}", - r"\x{0001f193}", - r"\x{0001f194}", - r"\x{0001f195}", - r"\x{0001f196}", - r"\x{0001f197}", - r"\x{0001f198}", - r"\x{0001f199}", - r"\x{0001f19a}", - r"\x{0001f1e6}", - r"\x{0001f1e7}", - r"\x{0001f1e8}", - r"\x{0001f201}", - r"\x{0001f202}", - r"\x{0001f21a}", - r"\x{0001f22f}", - r"\x{0001f232}", - r"\x{0001f233}", - r"\x{0001f234}", - r"\x{0001f235}", - r"\x{0001f236}", - r"\x{0001f237}", - r"\x{0001f238}", - r"\x{0001f239}", - r"\x{0001f23a}", - r"\x{0001f250}", - 
r"\x{0001f251}", - r"\x{0001f300}", - r"\x{0001f301}", - r"\x{0001f302}", - r"\x{0001f303}", - r"\x{0001f304}", - # Length 2 - r"\x{0001f305}", - ) - ), - ) - - -class StringsAsExpressionRE(unittest.TestCase): - def test_only_safe(self): - # Unique combinations of SAFE_CHARS using various lengths (1-4). - # elements = tuple(SAFE_CHARS) - elements = SAFE_CHARS - for i in range(1, 5): - for texts in combinations_with_replacement(elements, i): - with self.subTest(texts=texts): - self.assertEqual( - regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), - "|".join(texts), - ) - - def test_only_escapable_chars(self): - # Unique combinations of ESCAPE_CHARS using various lengths (1-4). - # elements = tuple(ESCAPE_CHARS) - elements = ESCAPE_CHARS - for i in range(1, 5): - for texts in combinations_with_replacement(elements, i): - with self.subTest(texts=texts): - self.assertEqual( - regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), - "|".join(f"\\{text}" for text in texts), - ) - - def test_reserved_only(self): - # Unique combinations of reserved expressions using various lengths (1-4). - elements = ( - r"\A", - r"\b", - r"\B", - r"\d", - r"\D", - r"\s", - r"\S", - r"\w", - r"\W", - r"\Z", - r"\1", - ) - for i in range(1, 5): - for texts in combinations_with_replacement(elements, i): - with self.subTest(texts=texts): - self.assertEqual( - regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), - "|".join(f"\\{text}" for text in texts), - ) - - def test_unsafe_only(self): - # TODO: Include text/chars such as punctuation, etc. - # Unique combinations of UNSAFE_CHARS using various lengths (1-4). 
- # elements = tuple(UNSAFE_CHARS) - elements = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" - for i in range(1, 5): - for texts in combinations_with_replacement(elements, i): - with self.subTest(texts=texts): - self.assertEqual( - regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), - "|".join(f"\\{text}" for text in texts), - ) - - -###################### -###################### # Multiple unsafe char -###################### self.assertEqual( -###################### regex_toolkit.strings_as_exp([".", "!", "?"], RegexFlavor.RE), -###################### "\\.|\\!|\\?", -###################### ) -###################### -###################### for texts, expected_exp in [ -###################### (["πŸ…°"], "\\πŸ…°"), -###################### (["πŸ…°", "πŸ…±"], "\\πŸ…°|\\πŸ…±"), -###################### (["alpha", "beta"], "alpha|beta"), -###################### (["πŸ…°lpha", "πŸ…±eta"], "\\πŸ…°lpha|\\πŸ…±eta"), -###################### (["πŸ…°lpha", "Beta"], "\\πŸ…°lpha|Beta"), -###################### ]: -###################### self.assertEqual( -###################### regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), -###################### expected_exp, -###################### ) - - -class StringsAsExpressionRE2(unittest.TestCase): - def test_only_safe(self): - # Unique combinations of SAFE_CHARS using various lengths (1-4). - # elements = tuple(SAFE_CHARS) - elements = SAFE_CHARS - for i in range(1, 5): - for texts in combinations_with_replacement(elements, i): - with self.subTest(texts=texts): - self.assertEqual( - regex_toolkit.strings_as_exp(texts, RegexFlavor.RE), - "|".join(texts), - ) - - def test_only_escapable_chars(self): - # Unique combinations of ESCAPE_CHARS using various lengths (1-4). 
- # elements = tuple(ESCAPE_CHARS) - elements = ESCAPE_CHARS - for i in range(1, 5): - for texts in combinations_with_replacement(elements, i): - with self.subTest(texts=texts): - self.assertEqual( - regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), - "|".join(f"\\{text}" for text in texts), - ) - - def test_reserved_only(self): - # Unique combinations of reserved expressions using various lengths (1-4). - elements = ( - r"\A", - r"\b", - r"\B", - r"\d", - r"\D", - r"\s", - r"\S", - r"\w", - r"\W", - r"\Z", - r"\1", - ) - for i in range(1, 5): - for texts in combinations_with_replacement(elements, i): - with self.subTest(texts=texts): - self.assertEqual( - regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), - "|".join(f"\\{text}" for text in texts), - ) - - def test_unsafe_only(self): - # TODO: Include text/chars such as punctuation, etc. - # Unique combinations of UNSAFE_CHARS using various lengths (1-4). - # elements = tuple(UNSAFE_CHARS) - elements_map = { - # Length 1 - "πŸ…°": r"\x{0001f170}", - "πŸ…±": r"\x{0001f171}", - "πŸ…Ύ": r"\x{0001f17e}", - "πŸ…Ώ": r"\x{0001f17f}", - "πŸ†Ž": r"\x{0001f18e}", - "πŸ†‘": r"\x{0001f191}", - "πŸ†’": r"\x{0001f192}", - "πŸ†“": r"\x{0001f193}", - "πŸ†”": r"\x{0001f194}", - "πŸ†•": r"\x{0001f195}", - "πŸ†–": r"\x{0001f196}", - "πŸ†—": r"\x{0001f197}", - "πŸ†˜": r"\x{0001f198}", - "πŸ†™": r"\x{0001f199}", - "πŸ†š": r"\x{0001f19a}", - "πŸ‡¦": r"\x{0001f1e6}", - "πŸ‡§": r"\x{0001f1e7}", - "πŸ‡¨": r"\x{0001f1e8}", - "🈁": r"\x{0001f201}", - "πŸˆ‚": r"\x{0001f202}", - "🈚": r"\x{0001f21a}", - "🈯": r"\x{0001f22f}", - "🈲": r"\x{0001f232}", - "🈳": r"\x{0001f233}", - "🈴": r"\x{0001f234}", - "🈡": r"\x{0001f235}", - "🈢": r"\x{0001f236}", - "🈷": r"\x{0001f237}", - "🈸": r"\x{0001f238}", - "🈹": r"\x{0001f239}", - "🈺": r"\x{0001f23a}", - "πŸ‰": r"\x{0001f250}", - "πŸ‰‘": r"\x{0001f251}", - "πŸŒ€": r"\x{0001f300}", - "🌁": r"\x{0001f301}", - "πŸŒ‚": r"\x{0001f302}", - "πŸŒƒ": r"\x{0001f303}", - "πŸŒ„": r"\x{0001f304}", - # Length 2 - 
"πŸŒ…": r"\x{0001f305}", - } - elements = tuple(elements_map) - for i in range(1, 5): - for texts in combinations_with_replacement(elements, i): - with self.subTest(texts=texts): - self.assertEqual( - regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), - "|".join(elements_map[text] for text in texts), - ) - - -##############################3 # Exact matches that equate to reserved spaces -##############################3 # E.g. Should match '\\' + 'd', not r'\d' -##############################3 for text in (r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"): -##############################3 texts = [text] -##############################3 with self.subTest(texts=texts): -##############################3 self.assertEqual( -##############################3 regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), -##############################3 f"\\{text}", -##############################3 ) -##############################3 -##############################3 # Single whitespace char -##############################3 for texts in (["\n"], ["\v"], ["\t"], ["\r"], ["\f"], ["\v"]): -##############################3 with self.subTest(texts=texts): -##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), texts[0]) -##############################3 -##############################3 # Single unsafe char -##############################3 for texts, expected_exp in [ -##############################3 (["."], "\\."), -##############################3 (["!"], "\\!"), -##############################3 (["?"], "\\?"), -##############################3 ]: -##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): -##############################3 self.assertEqual( -##############################3 regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), -##############################3 expected_exp, -##############################3 ) -##############################3 -##############################3 # Multiple unsafe char 
-##############################3 texts = [".", "!", "?"] -##############################3 self.assertEqual(regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), "\\.|\\!|\\?") -##############################3 -##############################3 for texts, expected_exp in [ -##############################3 (["πŸ…°"], "\\x{0001f170}"), -##############################3 (["πŸ…°", "πŸ…±"], "\\x{0001f170}|\\x{0001f171}"), -##############################3 (["alpha", "beta"], "alpha|beta"), -##############################3 (["πŸ…°lpha", "πŸ…±eta"], "\\x{0001f170}lpha|\\x{0001f171}eta"), -##############################3 (["πŸ…°lpha", "Beta"], "\\x{0001f170}lpha|Beta"), -##############################3 ]: -##############################3 with self.subTest(texts=texts, expected_exp=expected_exp): -##############################3 self.assertEqual( -##############################3 regex_toolkit.strings_as_exp(texts, RegexFlavor.RE2), -##############################3 expected_exp, -##############################3 ) - -# TODO: Add tests for actually compiling the e. 
diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..c1fccdf --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,119 @@ +import unittest +from collections.abc import Generator, Iterable + +import regex_toolkit + + +def is_sorted_by_len(texts: Iterable[str], reverse: bool = False) -> bool: + prev_len = None + for text in texts: + if prev_len is None: + prev_len = len(text) + if reverse: + if len(text) > prev_len: + return False + else: + if len(text) < prev_len: + return False + prev_len = len(text) + return True + + +class TestSortByLength(unittest.TestCase): + def setUp(self) -> None: + self.texts = { + "apple", + "orange", + "banana", + "grape", + "apricot", + "cherry", + "plum", + "blueberry", + "strawberry", + "blackberry", + } + self.texts_by_type = ( + (set, self.texts), + (frozenset, frozenset(self.texts)), + (tuple, tuple(self.texts)), + (list, list(self.texts)), + (dict, dict.fromkeys(self.texts, None)), + ) + + def test_iter_sort_by_len(self): + for try_type, typed_texts in self.texts_by_type: + for reverse in (False, True): + with self.subTest( + try_type=try_type, + typed_texts=typed_texts, + reverse=reverse, + ): + result = regex_toolkit.iter_sort_by_len( + typed_texts, + reverse=reverse, + ) + self.assertIsInstance(result, Generator) + result_tuple = tuple(result) + self.assertTrue(is_sorted_by_len(result_tuple, reverse=reverse)) + self.assertEqual( + result_tuple, + tuple(sorted(typed_texts, key=len, reverse=reverse)), + ) + + def test_sort_by_len(self): + for try_type, typed_texts in self.texts_by_type: + for reverse in (False, True): + with self.subTest( + try_type=try_type, + typed_texts=typed_texts, + reverse=reverse, + ): + result = regex_toolkit.sort_by_len(typed_texts, reverse=reverse) + self.assertIsInstance(result, tuple) + self.assertTrue(is_sorted_by_len(result, reverse=reverse)) + self.assertEqual( + result, + tuple(sorted(typed_texts, key=len, reverse=reverse)), + ) + + +class 
TestIterCharRange(unittest.TestCase): + def test_iter_char_range(self): + result = regex_toolkit.iter_char_range("a", "z") + self.assertIsInstance(result, Generator) + self.assertTupleEqual( + tuple(result), + tuple("abcdefghijklmnopqrstuvwxyz"), + ) + + def test_char_range(self): + result = regex_toolkit.char_range("a", "z") + self.assertIsInstance(result, tuple) + self.assertTupleEqual( + result, + tuple("abcdefghijklmnopqrstuvwxyz"), + ) + + +class TestMasking(unittest.TestCase): + def setUp(self): + self.text = "This is an example" + + def test_insert_word(self): + indexes = (8, 8) + for try_type, typed_indexes in ((tuple, indexes), (list, list(indexes))): + with self.subTest(try_type=try_type, indexes=indexes): + self.assertEqual( + regex_toolkit.mask_span(self.text, typed_indexes, "not "), + "This is not an example", + ) + + def test_replace_word(self): + indexes = (5, 7) + for try_type, typed_indexes in ((tuple, indexes), (list, list(indexes))): + with self.subTest(try_type=try_type, indexes=indexes): + self.assertEqual( + regex_toolkit.mask_span(self.text, typed_indexes, "isn't"), + "This isn't an example", + ) From 69dd03914e4951a6c08ac54a46467f359b09cb05 Mon Sep 17 00:00:00 2001 From: yaphott Date: Sun, 12 Mar 2023 16:56:59 -0500 Subject: [PATCH 5/7] Improved usage of frozenset. Further implementation of subtests. 
--- src/regex_toolkit/constants.py | 2 +- tests/test_base.py | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/regex_toolkit/constants.py b/src/regex_toolkit/constants.py index 74e42b0..4bfba99 100644 --- a/src/regex_toolkit/constants.py +++ b/src/regex_toolkit/constants.py @@ -15,7 +15,7 @@ ASCIILETTERS: Final[frozenset[str]] = frozenset( "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" ) -ALWAYS_SAFE: Final[frozenset[str]] = frozenset() | DIGITS | ASCIILETTERS +ALWAYS_SAFE: Final[frozenset[str]] = DIGITS | ASCIILETTERS ALWAYS_ESCAPED: Final[frozenset[str]] = frozenset( map(chr, b"()[]{}?*+-|^$\\.&~# \t\n\r\v\f") ) diff --git a/tests/test_base.py b/tests/test_base.py index 87af87c..1c91029 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -252,7 +252,8 @@ def test_safe_of_variable_length(self): # Ensure the expression compiles and matches each of the strings. pattern = self._re_compile(actual_exp) for text in texts: - self.assertTrue(pattern.match(text)) + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) def test_escapable_of_variable_length(self): # Unique combinations of `ALWAYS_ESCAPED` using various lengths. @@ -265,7 +266,8 @@ def test_escapable_of_variable_length(self): # Ensure the expression compiles and matches each of the strings. pattern = self._re_compile(actual_exp) for text in texts: - self.assertTrue(pattern.match(text)) + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) def test_reserved_of_variable_length(self): # Unique combinations of reserved expressions using various lengths. @@ -290,7 +292,8 @@ def test_reserved_of_variable_length(self): # Ensure the expression compiles and matches each of the strings. 
pattern = self._re_compile(actual_exp) for text in texts: - self.assertTrue(pattern.match(text)) + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) def test_unsafe_of_variable_length(self): # TODO: Include text/chars such as punctuation, etc. @@ -304,7 +307,8 @@ def test_unsafe_of_variable_length(self): # Ensure the expression compiles and matches each of the strings. pattern = self._re_compile(actual_exp) for text in texts: - self.assertTrue(pattern.match(text)) + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) ###################### @@ -344,7 +348,8 @@ def test_safe_of_variable_length(self): # Ensure the expression compiles and matches each of the strings. pattern = self._re_compile(actual_exp) for text in texts: - self.assertTrue(pattern.match(text)) + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) def test_escapable_of_variable_length(self): # Unique combinations of ALWAYS_ESCAPED using various lengths. @@ -357,7 +362,8 @@ def test_escapable_of_variable_length(self): # Ensure the expression compiles and matches each of the strings. pattern = self._re_compile(actual_exp) for text in texts: - self.assertTrue(pattern.match(text)) + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) def test_reserved_of_variable_length(self): # Unique combinations of reserved expressions using various lengths. @@ -382,7 +388,8 @@ def test_reserved_of_variable_length(self): # Ensure the expression compiles and matches each of the strings. pattern = self._re_compile(actual_exp) for text in texts: - self.assertTrue(pattern.match(text)) + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) def test_unsafe_of_variable_length(self): # TODO: Include text/chars such as punctuation, etc. 
@@ -439,7 +446,8 @@ def test_unsafe_of_variable_length(self): # Ensure the expression compiles and matches each of the strings. pattern = self._re_compile(actual_exp) for text in texts: - self.assertTrue(pattern.match(text)) + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) ##############################3 # Exact matches that equate to reserved spaces From 870627554466d61b70f2132ea6ec5aaf94be644a Mon Sep 17 00:00:00 2001 From: yaphott Date: Tue, 14 Mar 2023 19:51:08 -0500 Subject: [PATCH 6/7] Better name for global var name `ALWAYS_ESCAPED` is `ALWAYS_ESCAPE`. Edited docstrings and correct some example parts that misrender in the readme generator. Update Makefile and add cli args to doc generating script. Added a couple tests and improved test method names. --- Makefile | 21 +++- README.md | 32 +++--- docs/render_readme.py | 19 +++- src/regex_toolkit/base.py | 4 +- src/regex_toolkit/constants.py | 15 ++- src/regex_toolkit/utils.py | 32 +++--- tests/test_base.py | 176 ++++++++++++++++++++++----------- 7 files changed, 199 insertions(+), 100 deletions(-) diff --git a/Makefile b/Makefile index 46d3437..39d152d 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,28 @@ install-dev: test: ${PYTHON} -m pytest tests +lint: + ${PYTHON} -m pylint src + +format: + ${PYTHON} -m isort src tests docs/render_readme.py + ${PYTHON} -m black src tests docs/render_readme.py + build: + @echo 'Building package' ${PYTHON} -m build + @echo 'Done' publish: + @echo 'Building package' ${PYTHON} -m build - twine upload dist/regex_toolkit-*.tar.gz dist/regex_toolkit-*.whl + @echo 'Uploading package' + ${PYTHON} -m twine upload dist/regex_toolkit-*.tar.gz dist/regex_toolkit-*.whl + @echo 'Done' + +readme: + @echo 'Generating README.md' + @cd docs && ${PYTHON} render_readme.py + @echo 'Copying README.md' + @cp ./docs/README.md ./README.md + @echo 'Done' diff --git a/README.md b/README.md index bdfe0e2..4749334 100644 --- a/README.md +++ b/README.md @@ -127,14 +127,16 @@ Sort
strings by length. def ord_to_cpoint(ordinal: int) -> str ``` -Character codepoint from character ordinal. +Character ordinal to character codepoint. + +The codepoint is always 8 characters long (zero-padded). **Example**: - ```python - # Output: '00000061' - ord_to_cpoint(97) - ``` +```python +# Output: '00000061' +ord_to_cpoint(97) +``` **Arguments**: @@ -152,7 +154,7 @@ Character codepoint from character ordinal. def cpoint_to_ord(cpoint: str) -> int ``` -Character ordinal from character codepoint. +Character codepoint to character ordinal. **Arguments**: @@ -170,14 +172,14 @@ Character ordinal from character codepoint. def char_to_cpoint(char: str) -> str ``` -Character codepoint from character. +Character to character codepoint. **Example**: - ```python - # Output: '00000061' - char_to_cpoint("a") - ``` +```python +# Output: '00000061' +char_to_cpoint("a") +``` **Arguments**: @@ -216,7 +218,7 @@ def iter_char_range(first_cpoint: int, last_cpoint: int) -> Generator[str, None, None] ``` -Iterate all character within a range of codepoints (inclusive). +Iterate all characters within a range of codepoints (inclusive). **Arguments**: @@ -225,7 +227,7 @@ Iterate all character within a range of codepoints (inclusive). **Yields**: -- _str_ - Character from within a range of codepoints. +- _str_ - Characters within a range of codepoints. @@ -235,7 +237,7 @@ Iterate all character within a range of codepoints (inclusive). def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...] ``` -Tuple of all character within a range of codepoints (inclusive). +Tuple of all characters within a range of codepoints (inclusive). **Arguments**: @@ -289,7 +291,7 @@ Todo: Add support for overlapping (and unordered?) spans. **Arguments**: - `text` _str_ - String to slice. -- `spans` _Iterable[list[int] | tuple[int, int]]_ - Domains of index positions (x1, x2) to mask from the text. 
+- `spans` _Iterable[list[int] | tuple[int, int]]_ - Domains of index positions (x1, x2) to mask within the text. - `masks` _Iterable[str], optional_ - Masks to insert when slicing. Defaults to None. **Returns**: diff --git a/docs/render_readme.py b/docs/render_readme.py index fbf9aff..de92df3 100644 --- a/docs/render_readme.py +++ b/docs/render_readme.py @@ -1,8 +1,9 @@ -# import argparse +import argparse import json import logging import os import re +from pathlib import Path from jinja2 import Environment, FileSystemLoader from pydoc_markdown import PydocMarkdown @@ -104,9 +105,19 @@ def render_library_contents( file.write(rendered_contents) -def main() -> None: # config_file: str, template_file: str, output_file: str, replace: bool) -> None: +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--config", + type=Path, + default="config.json", + help="Path to the config file.", + ) + args = parser.parse_args() + # Load the config - with open("config.json", mode="r", encoding="utf-8") as file: + with open(args.config, mode="r", encoding="utf-8") as file: config = json.loads(file.read()) # Generate the library documentation @@ -123,10 +134,10 @@ def main() -> None: # config_file: str, template_file: str, output_file: str, r environment = Environment(loader=loader, auto_reload=False) template = environment.get_template(config["main_template"]) rendered = template.render(**config["template_data"]) + with open(config["output_file"], mode="w", encoding="utf-8") as file: file.write(rendered) if __name__ == "__main__": - # TODO: Implement argparse here main() diff --git a/src/regex_toolkit/base.py b/src/regex_toolkit/base.py index a8e0508..ca6a983 100644 --- a/src/regex_toolkit/base.py +++ b/src/regex_toolkit/base.py @@ -5,7 +5,7 @@ ] from collections.abc import Iterable -from regex_toolkit.constants import ALWAYS_ESCAPED, ALWAYS_SAFE +from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE from regex_toolkit.enums import 
RegexFlavor from regex_toolkit.utils import char_to_cpoint, iter_sort_by_len @@ -48,7 +48,7 @@ def _escape2(char: str) -> str: if char in ALWAYS_SAFE: # Safe as-is return char - elif char in ALWAYS_ESCAPED: + elif char in ALWAYS_ESCAPE: # Safe to escape with backslash return f"\\{char}" else: diff --git a/src/regex_toolkit/constants.py b/src/regex_toolkit/constants.py index 4bfba99..0d7cc43 100644 --- a/src/regex_toolkit/constants.py +++ b/src/regex_toolkit/constants.py @@ -5,17 +5,24 @@ from typing import Final +from regex_toolkit.enums import RegexFlavor + __all__ = [ + "ALWAYS_ESCAPE", "ALWAYS_SAFE", - "ALWAYS_ESCAPED", "ASCIILETTERS", "DIGITS", ] -DIGITS: Final[frozenset[str]] = frozenset("0123456789") + +DIGITS: Final[frozenset[str]] = frozenset(map(chr, b"0123456789")) ASCIILETTERS: Final[frozenset[str]] = frozenset( - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + map(chr, b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") ) ALWAYS_SAFE: Final[frozenset[str]] = DIGITS | ASCIILETTERS -ALWAYS_ESCAPED: Final[frozenset[str]] = frozenset( +ALWAYS_ESCAPE: Final[frozenset[str]] = frozenset( map(chr, b"()[]{}?*+-|^$\\.&~# \t\n\r\v\f") ) + +REGEX_FLAVORS: Final[frozenset[RegexFlavor]] = frozenset( + {RegexFlavor.RE, RegexFlavor.RE2} +) diff --git a/src/regex_toolkit/utils.py b/src/regex_toolkit/utils.py index 934d49c..67365c2 100644 --- a/src/regex_toolkit/utils.py +++ b/src/regex_toolkit/utils.py @@ -52,14 +52,16 @@ def sort_by_len( def ord_to_cpoint(ordinal: int) -> str: - """Character codepoint from character ordinal. + """Character ordinal to character codepoint. + + The codepoint is always 8 characters long (zero-padded). Example: - ```python - # Output: '00000061' - ord_to_cpoint(97) - ``` + ```python + # Output: '00000061' + ord_to_cpoint(97) + ``` Args: ordinal (int): Character ordinal. @@ -71,7 +73,7 @@ def ord_to_cpoint(ordinal: int) -> str: def cpoint_to_ord(cpoint: str) -> int: - """Character ordinal from character codepoint. 
+ """Character codepoint to character ordinal. Args: cpoint (str): Character codepoint. @@ -83,14 +85,14 @@ def cpoint_to_ord(cpoint: str) -> int: def char_to_cpoint(char: str) -> str: - """Character codepoint from character. + """Character to character codepoint. Example: - ```python - # Output: '00000061' - char_to_cpoint("a") - ``` + ```python + # Output: '00000061' + char_to_cpoint("a") + ``` Args: char (str): Character. @@ -120,21 +122,21 @@ def to_nfc(text: str) -> str: def iter_char_range(first_cpoint: int, last_cpoint: int) -> Generator[str, None, None]: - """Iterate all character within a range of codepoints (inclusive). + """Iterate all characters within a range of codepoints (inclusive). Args: first_cpoint (int): Starting (first) codepoint. last_cpoint (int): Ending (last) codepoint. Yields: - str: Character from within a range of codepoints. + str: Characters within a range of codepoints. """ for i in range(ord(first_cpoint), ord(last_cpoint) + 1): yield chr(i) def char_range(first_cpoint: int, last_cpoint: int) -> tuple[str, ...]: - """Tuple of all character within a range of codepoints (inclusive). + """Tuple of all characters within a range of codepoints (inclusive). Args: first_cpoint (int): Starting (first) codepoint. @@ -185,7 +187,7 @@ def mask_spans( Args: text (str): String to slice. - spans (Iterable[list[int] | tuple[int, int]]): Domains of index positions (x1, x2) to mask from the text. + spans (Iterable[list[int] | tuple[int, int]]): Domains of index positions (x1, x2) to mask within the text. masks (Iterable[str], optional): Masks to insert when slicing. Defaults to None. 
Returns: diff --git a/tests/test_base.py b/tests/test_base.py index 1c91029..afc1259 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -5,7 +5,7 @@ import re2 import regex_toolkit -from regex_toolkit.constants import ALWAYS_ESCAPED, ALWAYS_SAFE +from regex_toolkit.constants import ALWAYS_ESCAPE, ALWAYS_SAFE from regex_toolkit.enums import RegexFlavor @@ -14,7 +14,7 @@ def setUp(self): self._flavor = RegexFlavor.RE self._re_compile = re.compile - def test_safe_chars(self): + def test_safe(self): for char in ALWAYS_SAFE: with self.subTest(char=char): expected_exp = char @@ -24,8 +24,8 @@ def test_safe_chars(self): pattern = self._re_compile(actual_exp) self.assertTrue(pattern.match(char)) - def test_escapable_chars(self): - for char in ALWAYS_ESCAPED: + def test_escapable(self): + for char in ALWAYS_ESCAPE: with self.subTest(char=char): expected_exp = f"\\{char}" actual_exp = regex_toolkit.escape(char, self._flavor) @@ -34,7 +34,7 @@ def test_escapable_chars(self): pattern = self._re_compile(actual_exp) self.assertTrue(pattern.match(char)) - def test_unknown_chars(self): + def test_unknown(self): # TODO: Include additional characters to test. 
for char in "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…": with self.subTest(char=char): @@ -51,7 +51,7 @@ def setUp(self): self._flavor = RegexFlavor.RE2 self._re_compile = re2.compile - def test_safe_chars(self): + def test_safe(self): for char in ALWAYS_SAFE: with self.subTest(char=char): expected_exp = char @@ -61,8 +61,8 @@ def test_safe_chars(self): pattern = self._re_compile(actual_exp) self.assertTrue(pattern.match(char)) - def test_escapable_chars(self): - for char in ALWAYS_ESCAPED: + def test_escapable(self): + for char in ALWAYS_ESCAPE: with self.subTest(char=char): expected_exp = f"\\{char}" actual_exp = regex_toolkit.escape(char, self._flavor) @@ -71,8 +71,9 @@ def test_escapable_chars(self): pattern = self._re_compile(actual_exp) self.assertTrue(pattern.match(char)) - def test_unknown_chars(self): + def test_unknown(self): # TODO: Include additional characters to test. + # NOTE: Same as running: "\\x{" + format(ord("πŸŒ„"), "x").zfill(8) + "}" for char, expected_exp in ( # Length 1 ("πŸ…°", r"\x{0001f170}"), @@ -129,8 +130,21 @@ def setUp(self): self._flavor = RegexFlavor.RE self._re_compile = re.compile + def test_safe_individual_char(self): + # Single character. + for char in ALWAYS_SAFE: + with self.subTest(char=char): + text = char + expected_exp = char + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + # TODO: Add tests for mix of characters. - def test_safe_chars(self): + def test_safe_joined_as_one(self): + # All characters. 
text = "".join(ALWAYS_SAFE) expected_exp = text actual_exp = regex_toolkit.string_as_exp(text, self._flavor) @@ -139,16 +153,30 @@ def test_safe_chars(self): pattern = self._re_compile(actual_exp) self.assertTrue(pattern.match(text)) - def test_escapable_chars(self): - text = "".join(ALWAYS_ESCAPED) - expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPED) + def test_escapable_individual_char(self): + # Single character. + for char in ALWAYS_ESCAPE: + with self.subTest(char=char): + text = char + expected_exp = f"\\{char}" + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_escapable_joined_as_one(self): + # All characters. + text = "".join(ALWAYS_ESCAPE) + expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) actual_exp = regex_toolkit.string_as_exp(text, self._flavor) self.assertEqual(actual_exp, expected_exp) # Ensure the expression compiles and matches the string. pattern = self._re_compile(actual_exp) self.assertTrue(pattern.match(text)) - def test_unknown_chars(self): + def test_unsafe_joined_as_one(self): + # All characters. text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" expected_exp = "".join(f"\\{char}" for char in text) actual_exp = regex_toolkit.string_as_exp(text, self._flavor) @@ -164,7 +192,20 @@ def setUp(self): self._re_compile = re2.compile # TODO: Add tests for mix of characters. - def test_safe_chars(self): + def test_safe_individual_char(self): + # Single character. 
+ for char in ALWAYS_SAFE: + with self.subTest(char=char): + text = char + expected_exp = char + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_safe_joined_as_one(self): + # All characters. text = "".join(ALWAYS_SAFE) expected_exp = "".join(ALWAYS_SAFE) actual_exp = regex_toolkit.string_as_exp(text, self._flavor) @@ -173,16 +214,29 @@ def test_safe_chars(self): pattern = self._re_compile(actual_exp) self.assertTrue(pattern.match(text)) - def test_escapable_chars(self): - text = "".join(ALWAYS_ESCAPED) - expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPED) + def test_escapable_individual_char(self): + # Single character. + for char in ALWAYS_ESCAPE: + with self.subTest(char=char): + text = char + expected_exp = f"\\{char}" + actual_exp = regex_toolkit.string_as_exp(text, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches the string. + pattern = self._re_compile(actual_exp) + self.assertTrue(pattern.match(text)) + + def test_escapable_joined_as_one(self): + # All characters. + text = "".join(ALWAYS_ESCAPE) + expected_exp = "".join(f"\\{char}" for char in ALWAYS_ESCAPE) actual_exp = regex_toolkit.string_as_exp(text, self._flavor) self.assertEqual(actual_exp, expected_exp) # Ensure the expression compiles and matches the string. 
pattern = self._re_compile(actual_exp) self.assertTrue(pattern.match(text)) - def test_unknown_chars(self): + def test_unknown_joined_as_one(self): text = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" expected_exp = r"".join( ( @@ -235,15 +289,20 @@ def test_unknown_chars(self): self.assertTrue(pattern.match(text)) +RESERVED_EXPRESSIONS = frozenset( + {r"\A", r"\b", r"\B", r"\d", r"\D", r"\s", r"\S", r"\w", r"\W", r"\Z", r"\1"} +) + + class StringsAsExpressionRE(unittest.TestCase): def setUp(self): self._flavor = RegexFlavor.RE self._re_compile = re.compile self._max_combo_length = 2 - def test_safe_of_variable_length(self): + def test_safe_of_various_lengths(self): # Unique combinations of `ALWAYS_SAFE` using various lengths. - elements = set(ALWAYS_SAFE) + elements = ALWAYS_SAFE for texts in product(elements, repeat=self._max_combo_length): with self.subTest(texts=texts): expected_exp = r"|".join(texts) @@ -255,9 +314,9 @@ def test_safe_of_variable_length(self): with self.subTest("match pattern", text=text): self.assertTrue(pattern.match(text)) - def test_escapable_of_variable_length(self): - # Unique combinations of `ALWAYS_ESCAPED` using various lengths. - elements = set(ALWAYS_ESCAPED) + def test_escapable_of_various_lengths(self): + # Unique combinations of `ALWAYS_ESCAPE` using various lengths. + elements = ALWAYS_ESCAPE for texts in product(elements, repeat=self._max_combo_length): with self.subTest(texts=texts): expected_exp = r"|".join(f"\\{text}" for text in texts) @@ -269,21 +328,11 @@ def test_escapable_of_variable_length(self): with self.subTest("match pattern", text=text): self.assertTrue(pattern.match(text)) - def test_reserved_of_variable_length(self): + def test_reserved_of_various_lengths(self): # Unique combinations of reserved expressions using various lengths. 
- elements = ( - r"\A", - r"\b", - r"\B", - r"\d", - r"\D", - r"\s", - r"\S", - r"\w", - r"\W", - r"\Z", - r"\1", - ) + # Exact matches that equate to reserved spaces + # E.g. Should match '\\' + 'n', not r'\n' + elements = RESERVED_EXPRESSIONS for texts in product(elements, repeat=self._max_combo_length): with self.subTest(texts=texts): expected_exp = r"|".join(f"\\{text}" for text in texts) @@ -295,7 +344,7 @@ def test_reserved_of_variable_length(self): with self.subTest("match pattern", text=text): self.assertTrue(pattern.match(text)) - def test_unsafe_of_variable_length(self): + def test_unsafe_of_various_lengths(self): # TODO: Include text/chars such as punctuation, etc. # Unique combinations of `ALWAYS_SAFE` using various lengths. elements = "πŸ…°πŸ…±πŸ…ΎπŸ…ΏπŸ†ŽπŸ†‘πŸ†’πŸ†“πŸ†”πŸ†•πŸ†–πŸ†—πŸ†˜πŸ†™πŸ†šπŸ‡¦πŸ‡§πŸ‡¨πŸˆπŸˆ‚πŸˆšπŸˆ―πŸˆ²πŸˆ³πŸˆ΄πŸˆ΅πŸˆΆπŸˆ·πŸˆΈπŸˆΉπŸˆΊπŸ‰πŸ‰‘πŸŒ€πŸŒπŸŒ‚πŸŒƒπŸŒ„πŸŒ…" @@ -310,6 +359,25 @@ def test_unsafe_of_variable_length(self): with self.subTest("match pattern", text=text): self.assertTrue(pattern.match(text)) + def test_safe_and_escapable_of_various_lengths(self): + # Unique combinations of `ALWAYS_SAFE` and `ALWAYS_ESCAPE` using various lengths. + elements = ALWAYS_SAFE | ALWAYS_ESCAPE + for texts in product(elements, repeat=self._max_combo_length): + with self.subTest(texts=texts): + expected_exp = r"|".join( + text if text in ALWAYS_SAFE else f"\\{text}" for text in texts + ) + actual_exp = regex_toolkit.strings_as_exp(texts, self._flavor) + self.assertEqual(actual_exp, expected_exp) + # Ensure the expression compiles and matches each of the strings. 
+ pattern = self._re_compile(actual_exp) + for text in texts: + with self.subTest("match pattern", text=text): + self.assertTrue(pattern.match(text)) + + # def test_actual_examples(self): + # + ###################### ###################### # Multiple unsafe char @@ -337,7 +405,7 @@ def setUp(self): self._re_compile = re2.compile self._max_combo_length = 2 - def test_safe_of_variable_length(self): + def test_safe_of_variable_lengths(self): # Unique combinations of ALWAYS_SAFE using various lengths. elements = set(ALWAYS_SAFE) for texts in product(elements, repeat=self._max_combo_length): @@ -351,9 +419,9 @@ def test_safe_of_variable_length(self): with self.subTest("match pattern", text=text): self.assertTrue(pattern.match(text)) - def test_escapable_of_variable_length(self): - # Unique combinations of ALWAYS_ESCAPED using various lengths. - elements = set(ALWAYS_ESCAPED) + def test_escapable_of_variable_lengths(self): + # Unique combinations of ALWAYS_ESCAPE using various lengths. + elements = ALWAYS_ESCAPE for texts in product(elements, repeat=self._max_combo_length): with self.subTest(texts=texts): expected_exp = r"|".join(f"\\{text}" for text in texts) @@ -365,21 +433,11 @@ def test_escapable_of_variable_length(self): with self.subTest("match pattern", text=text): self.assertTrue(pattern.match(text)) - def test_reserved_of_variable_length(self): + def test_reserved_of_variable_lengths(self): # Unique combinations of reserved expressions using various lengths. - elements = ( - r"\A", - r"\b", - r"\B", - r"\d", - r"\D", - r"\s", - r"\S", - r"\w", - r"\W", - r"\Z", - r"\1", - ) + # Exact matches that equate to reserved spaces + # E.g. 
Should match '\\' + 'n', not r'\n' + elements = RESERVED_EXPRESSIONS for texts in product(elements, repeat=self._max_combo_length): with self.subTest(texts=texts): expected_exp = r"|".join(f"\\{text}" for text in texts) @@ -391,7 +449,7 @@ def test_reserved_of_variable_length(self): with self.subTest("match pattern", text=text): self.assertTrue(pattern.match(text)) - def test_unsafe_of_variable_length(self): + def test_unsafe_of_variable_lengths(self): # TODO: Include text/chars such as punctuation, etc. # Unique combinations of ALWAYS_SAFE using various lengths. elements_map = { @@ -451,7 +509,7 @@ def test_unsafe_of_variable_length(self): ##############################3 # Exact matches that equate to reserved spaces -##############################3 # E.g. Should match '\\' + 'd', not r'\d' +##############################3 # E.g. Should match '\\' + 'n', not r'\n' ##############################3 for text in (r"\w", r"\W", r"\d", r"\D", r"\s", r"\S", r"\1"): ##############################3 texts = [text] ##############################3 with self.subTest(texts=texts): From af45381ab1915044505b34c57b33e33cf3e4c165 Mon Sep 17 00:00:00 2001 From: yaphott Date: Tue, 14 Mar 2023 20:11:14 -0500 Subject: [PATCH 7/7] Version upgrade. --- src/regex_toolkit/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/regex_toolkit/__init__.py b/src/regex_toolkit/__init__.py index d878546..e7b8333 100644 --- a/src/regex_toolkit/__init__.py +++ b/src/regex_toolkit/__init__.py @@ -33,4 +33,4 @@ "to_nfc", "to_utf8", ] -__version__ = "0.0.3" +__version__ = "0.0.4"