diff --git a/.travis.yml b/.travis.yml index 6588db3e7..75179d4e5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,9 +18,11 @@ install: os: - linux + # command to run tests, e.g. python setup.py test script: coverage run --source=pythainlp setup.py test + after_success: coveralls diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 62ecbcbb3..a10d62615 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,15 +19,14 @@ We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-mod ## Code Guidelines -- Use [PEP8](http://www.python.org/dev/peps/pep-0008/); +- Follow [PEP8](http://www.python.org/dev/peps/pep-0008/) and use [black](https://github.com/ambv/black); - Write tests for your new features (please see "Tests" topic below); - Always remember that [commented code is dead code](http://www.codinghorror.com/blog/2008/07/coding-without-comments.html); - Name identifiers (variables, classes, functions, module names) with meaningful and pronounceable names (`x` is always wrong); -- When manipulating strings, use [Python's new-style - formatting](http://docs.python.org/library/string.html#format-string-syntax) - (`'{} = {}'.format(a, b)` instead of `'%s = %s' % (a, b)`); +- When manipulating strings, use [f-strings](https://www.python.org/dev/peps/pep-0498/) + (use `f"{a} = {b}"` instead of `"{} = {}".format(a, b)` and `"%s = %s" % (a, b)`); - All `#TODO` comments should be turned into issues (use our [GitHub issue system](https://github.com/PyThaiNLP/pythainlp/)); - Run all tests before pushing (just execute `tox`) so you will know if your diff --git a/Makefile b/Makefile index d5c977215..0f103632c 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,6 @@ help: clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts - clean-build: ## remove build artifacts rm -fr build/ rm -fr dist/ @@ -51,19 +50,16 @@ lint: ## check style with flake8 flake8 pythainlp tests test: ## run tests quickly with the default Python - - python setup.py test + python setup.py test test-all: ## run tests on every Python version with tox tox coverage: ## check code coverage quickly with the default Python - - coverage run --source pythainlp setup.py test - - coverage report -m - coverage html - $(BROWSER) htmlcov/index.html + coverage run --source pythainlp setup.py test + coverage report -m + coverage html + $(BROWSER) htmlcov/index.html release: clean ## package and upload a release python setup.py sdist upload diff --git a/README-pypi.md b/README-pypi.md index 886a37edb..b779f157e 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -1,6 +1,6 @@ ![PyThaiNLP Logo](https://avatars0.githubusercontent.com/u/32934255?s=200&v=4) -# PyThaiNLP 2.0 +# PyThaiNLP 2.0.2 [![Codacy Badge](https://api.codacy.com/project/badge/Grade/cb946260c87a4cc5905ca608704406f7)](https://www.codacy.com/app/pythainlp/pythainlp_2?utm_source=github.com&utm_medium=referral&utm_content=PyThaiNLP/pythainlp&utm_campaign=Badge_Grade)[![pypi](https://img.shields.io/pypi/v/pythainlp.svg)](https://pypi.python.org/pypi/pythainlp) [![Build Status](https://travis-ci.org/PyThaiNLP/pythainlp.svg?branch=develop)](https://travis-ci.org/PyThaiNLP/pythainlp) @@ -12,9 +12,9 @@ PyThaiNLP is a Python library for natural language processing (NLP) of Thai lang PyThaiNLP includes Thai word tokenizers, transliterators, soundex converters, part-of-speech taggers, and spell checkers. 
-📖 For details on upgrading from PyThaiNLP 1.7 to PyThaiNLP 2.0, see [From PyThaiNLP 1.7 to PyThaiNLP 2.0](https://thainlp.org/pythainlp/docs/2.0/notes/pythainlp-1_7-2_0.html) +📖 [Upgrading from PyThaiNLP 1.7 to 2.0](https://thainlp.org/pythainlp/docs/2.0/notes/pythainlp-1_7-2_0.html) -📖 For ThaiNER user after upgrading from PyThaiNLP 1.7 to PyThaiNLP 2.0, see [Upgrade ThaiNER from PyThaiNLP 1.7 to PyThaiNLP 2.0](https://github.com/PyThaiNLP/pythainlp/wiki/Upgrade-ThaiNER-from-PyThaiNLP-1.7-to-PyThaiNLP-2.0) +📖 [Upgrade ThaiNER from PyThaiNLP 1.7 to 2.0](https://github.com/PyThaiNLP/pythainlp/wiki/Upgrade-ThaiNER-from-PyThaiNLP-1.7-to-PyThaiNLP-2.0) 📫 follow us on Facebook [Pythainlp](https://www.facebook.com/pythainlp/) diff --git a/README.md b/README.md index 880a579ca..e25c27d97 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,12 @@ Thai Natural Language Processing in Python. PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk` but with focus on Thai language. - [Current PyThaiNLP stable release is 2.0](https://github.com/PyThaiNLP/pythainlp/tree/master) -- PyThaiNLP 2.0 will support only Python 3.6+. Some functions may work with older version of Python 3, but it is not well-tested and will not be supported. See [PyThaiNLP 2.0 change log](https://github.com/PyThaiNLP/pythainlp/issues/118). -- Python 2 users can use PyThaiNLP 1.6, our latest released that tested with Python 2.7. +- PyThaiNLP 2.0 supports Python 3.6+. Some functions may work with older versions of Python 3, but they are not well-tested and will not be supported. See [PyThaiNLP 2.0 change log](https://github.com/PyThaiNLP/pythainlp/issues/118). +- Python 2.7+ users can use PyThaiNLP 1.6. -**This is a document for development branch (post 1.7.x). Things will break. For a stable branch document, see [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** +**This is a document for development branch (post 2.0). Things will break. For a stable branch document, see [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** -📫 follow us on Facebook [Pythainlp](https://www.facebook.com/pythainlp/) +📫 follow us on Facebook [PyThaiNLP](https://www.facebook.com/pythainlp/) ## Capabilities @@ -34,7 +34,7 @@ PyThaiNLP is a Python package for text processing and linguistic analysis, simil - Thai misspellings detection and spelling correction (```spell```) - Thai soundex (```lk82```, ```udom83```, ```metasound```) - Thai WordNet wrapper -- and much more - see examples in [PyThaiNLP Get Started notebook](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb). 
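A minimal usage sketch of the capability list above, assuming a PyThaiNLP 2.0 install; it mirrors the calls demonstrated in the Get Started notebook added later in this changeset, and the Thai example strings are arbitrary illustrations:

```python
# Minimal sketch of a few capabilities listed above (PyThaiNLP 2.0 top-level API,
# as exercised in notebooks/pythainlp-get-started.ipynb).
from pythainlp import word_tokenize
from pythainlp.transliterate import romanize
from pythainlp.soundex import lk82

text = "ฉันรักภาษาไทย"  # "I love the Thai language"; any Thai text works here

print(word_tokenize(text))                     # dictionary-based word segmentation (default "newmm" engine)
print(word_tokenize(text, engine="longest"))   # longest-matching segmentation engine
print(romanize("แมว"))                         # Thai-to-Latin romanization, e.g. "แมว" (cat) -> 'maeo'
print(lk82("แมว"))                             # LK82 soundex code for the same word
```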
## Installation @@ -102,13 +102,10 @@ PyThaiNLP āđ€āļ›āđ‡āļ™āđ„āļĨāļšāļēāļĢāļĩāļ āļēāļĐāļēāđ„āļžāļ—āļ­āļ™āđ€āļžāļ·āđˆ > āđ€āļžāļĢāļēāļ°āđ‚āļĨāļāļ‚āļąāļšāđ€āļ„āļĨāļ·āđˆāļ­āļ™āļ•āđˆāļ­āđ„āļ›āļ”āđ‰āļ§āļĒāļāļēāļĢāđāļšāđˆāļ‡āļ›āļąāļ™ -āļĢāļ­āļ‡āļĢāļąāļš Python 3.6 āļ‚āļķāđ‰āļ™āđ„āļ› +- PyThaiNLP 2.0 āļĢāļ­āļ‡āļĢāļąāļš Python 3.6 āļ‚āļķāđ‰āļ™āđ„āļ› +- āļœāļđāđ‰āđƒāļŠāđ‰ Python 2.7+ āļĒāļąāļ‡āļŠāļēāļĄāļēāļĢāļ–āđƒāļŠāđ‰ PyThaiNLP 1.6 āđ„āļ”āđ‰ -- āļ•āļąāđ‰āļ‡āđāļ•āđˆāļĢāļļāđˆāļ™ 1.7 PyThaiNLP āļˆāļ°āđ€āļĨāļīāļāļŠāļ™āļąāļšāļŠāļ™āļļāļ™ Python 2 (āļšāļēāļ‡āļŸāļąāļ‡āļāđŒāļŠāļąāļ™āļ­āļēāļˆāļĒāļąāļ‡āļ—āļģāļ‡āļēāļ™āđ„āļ”āđ‰ āđāļ•āđˆāļˆāļ°āđ„āļĄāđˆāđ„āļ”āđ‰āļĢāļąāļšāļāļēāļĢāļŠāļ™āļąāļšāļŠāļ™āļļāļ™) -- āļ•āļąāđ‰āļ‡āđāļ•āđˆāļĢāļļāđˆāļ™ 2.0 āļˆāļ°āļĒāļļāļ•āļīāļāļēāļĢāļĢāļ­āļ‡āļĢāļąāļš Python 2 āļ—āļąāđ‰āļ‡āļŦāļĄāļ” -- āļœāļđāđ‰āđƒāļŠāđ‰ Python 2 āļĒāļąāļ‡āļŠāļēāļĄāļēāļĢāļ–āđƒāļŠāđ‰ PyThaiNLP 1.6 āđ„āļ”āđ‰ - -**āđ€āļ­āļāļŠāļēāļĢāļ™āļĩāđ‰āļŠāļģāļŦāļĢāļąāļšāļĢāļļāđˆāļ™āļžāļąāļ’āļ™āļē (āļŦāļĨāļąāļ‡ 1.7.x) āļ­āļēāļˆāļĄāļĩāļāļēāļĢāđ€āļ›āļĨāļĩāđˆāļĒāļ™āđāļ›āļĨāļ‡āđ„āļ”āđ‰āļ•āļĨāļ­āļ” āļŠāļģāļŦāļĢāļąāļšāđ€āļ­āļāļŠāļēāļĢāļĢāļļāđˆāļ™āđ€āļŠāļ–āļĩāļĒāļĢ āļ”āļđāļ—āļĩāđˆ [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** +**āđ€āļ­āļāļŠāļēāļĢāļ™āļĩāđ‰āļŠāļģāļŦāļĢāļąāļšāļĢāļļāđˆāļ™āļžāļąāļ’āļ™āļē (āļŦāļĨāļąāļ‡ 2.0) āļ­āļēāļˆāļĄāļĩāļāļēāļĢāđ€āļ›āļĨāļĩāđˆāļĒāļ™āđāļ›āļĨāļ‡āđ„āļ”āđ‰āļ•āļĨāļ­āļ” āļŠāļģāļŦāļĢāļąāļšāđ€āļ­āļāļŠāļēāļĢāļĢāļļāđˆāļ™āđ€āļŠāļ–āļĩāļĒāļĢ āļ”āļđāļ—āļĩāđˆ [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** ðŸ“Ŧ āļ•āļīāļ”āļ•āļēāļĄāļ‚āđˆāļēāļ§āļŠāļēāļĢāđ„āļ”āđ‰āļ—āļĩāđˆ Facebook [Pythainlp](https://www.facebook.com/pythainlp/) @@ -125,7 +122,7 @@ PyThaiNLP āđ€āļ›āđ‡āļ™āđ„āļĨāļšāļēāļĢāļĩāļ āļēāļĐāļēāđ„āļžāļ—āļ­āļ™āđ€āļžāļ·āđˆ - āļ•āļĢāļ§āļˆāļ„āļģāļŠāļ°āļāļ”āļœāļīāļ”āđƒāļ™āļ āļēāļĐāļēāđ„āļ—āļĒ (```spell```) - soundex āļ āļēāļĐāļēāđ„āļ—āļĒ (```lk82```, ```udom83```, ```metasound```) - Thai WordNet wrapper -- āđāļĨāļ°āļ­āļ·āđˆāļ™ āđ† [āļ”āļđāļ•āļąāļ§āļ­āļĒāđˆāļēāļ‡](https://github.com/PyThaiNLP/pythainlp/tree/dev/examples) +- āđāļĨāļ°āļ­āļ·āđˆāļ™ āđ† āļ”āļđāļ•āļąāļ§āļ­āļĒāđˆāļēāļ‡āđ„āļ”āđ‰āđƒāļ™ [PyThaiNLP Get Started notebook](https://github.com/PyThaiNLP/pythainlp/blob/dev/notebooks/pythainlp-get-started.ipynb) ## āļ•āļīāļ”āļ•āļąāđ‰āļ‡ diff --git a/bin/pythainlp b/bin/pythainlp index 3582b89ad..1e3a68691 100644 --- a/bin/pythainlp +++ b/bin/pythainlp @@ -45,4 +45,4 @@ elif args.soundex!=None: args.engine="lk82" print(soundex(args.soundex, engine=args.engine)) else: - print("PyThaiNLP 2.0") + print("PyThaiNLP 2.0.2") diff --git a/conda.recipe/meta-old.yaml b/conda.recipe/meta-old.yaml deleted file mode 100644 index 632fb2109..000000000 --- a/conda.recipe/meta-old.yaml +++ /dev/null @@ -1,49 +0,0 @@ -{% set version = "1.7.2" %} - -package: - name: pythainlp - version: {{ version }} - -build: - noarch: python - number: 0 - script: python -m pip install --no-deps --ignore-installed . - -requirements: - host: - - pip - - python - - setuptools - - nltk - - future - - six - - marisa_trie - - dill - - pytz - - tinydb - - tqdm - - - run: - - python - - nltk - - future - - six - - marisa_trie - - dill - - pytz - - tinydb - - tqdm - -test: - imports: - - pvlib - -about: - home: https://github.com/PyThaiNLP/pythainlp - license: Apache License 2.0 - summary: 'Thai Natural Language Processing in Python.' 
- -extra: - recipe-maintainers: - - pythainlp diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 8e36acad6..f25188849 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "1.7.2" %} +{% set version = "2.0.2" %} package: name: pythainlp diff --git a/docs/api/spell.rst b/docs/api/spell.rst index 7544a58d5..b2c77736b 100644 --- a/docs/api/spell.rst +++ b/docs/api/spell.rst @@ -8,8 +8,4 @@ Modules ------- .. autofunction:: spell -.. autofunction:: pythainlp.spell.pn.spell -.. autofunction:: pythainlp.spell.pn.prob -.. autofunction:: pythainlp.spell.pn.correct -.. autofunction:: pythainlp.spell.pn.known -.. autofunction:: pythainlp.spell.pn.dictionary +.. autofunction:: correct diff --git a/docs/api/util.rst b/docs/api/util.rst index 1906fe48d..166f52375 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -14,9 +14,9 @@ Modules .. autofunction:: digit_to_text .. autofunction:: eng_to_thai .. autofunction:: find_keyword -.. autofunction:: is_thai -.. autofunction:: is_thaichar -.. autofunction:: is_thaiword +.. autofunction:: countthai +.. autofunction:: isthai +.. autofunction:: isthaichar .. autofunction:: normalize .. autofunction:: now_reign_year .. autofunction:: num_to_thaiword diff --git a/docs/pythainlp-1-3-thai.md b/docs/archive/pythainlp-1-3-thai.md similarity index 100% rename from docs/pythainlp-1-3-thai.md rename to docs/archive/pythainlp-1-3-thai.md diff --git a/docs/pythainlp-1-4-eng.md b/docs/archive/pythainlp-1-4-eng.md similarity index 100% rename from docs/pythainlp-1-4-eng.md rename to docs/archive/pythainlp-1-4-eng.md diff --git a/docs/pythainlp-1-4-eng.pdf b/docs/archive/pythainlp-1-4-eng.pdf similarity index 100% rename from docs/pythainlp-1-4-eng.pdf rename to docs/archive/pythainlp-1-4-eng.pdf diff --git a/docs/pythainlp-1-4-thai.md b/docs/archive/pythainlp-1-4-thai.md similarity index 100% rename from docs/pythainlp-1-4-thai.md rename to docs/archive/pythainlp-1-4-thai.md diff --git a/docs/pythainlp-1-4-thai.pdf b/docs/archive/pythainlp-1-4-thai.pdf similarity index 100% rename from docs/pythainlp-1-4-thai.pdf rename to docs/archive/pythainlp-1-4-thai.pdf diff --git a/docs/pythainlp-1-5-eng.md b/docs/archive/pythainlp-1-5-eng.md similarity index 100% rename from docs/pythainlp-1-5-eng.md rename to docs/archive/pythainlp-1-5-eng.md diff --git a/docs/pythainlp-1-5-thai.md b/docs/archive/pythainlp-1-5-thai.md similarity index 100% rename from docs/pythainlp-1-5-thai.md rename to docs/archive/pythainlp-1-5-thai.md diff --git a/docs/pythainlp-1-6-eng.md b/docs/archive/pythainlp-1-6-eng.md similarity index 100% rename from docs/pythainlp-1-6-eng.md rename to docs/archive/pythainlp-1-6-eng.md diff --git a/docs/pythainlp-1-6-thai.md b/docs/archive/pythainlp-1-6-thai.md similarity index 100% rename from docs/pythainlp-1-6-thai.md rename to docs/archive/pythainlp-1-6-thai.md diff --git a/docs/pythainlp-1-7.md b/docs/archive/pythainlp-1-7.md similarity index 100% rename from docs/pythainlp-1-7.md rename to docs/archive/pythainlp-1-7.md diff --git a/docs/pythainlp-dev-thai.md b/docs/archive/pythainlp-dev-thai.md similarity index 100% rename from docs/pythainlp-dev-thai.md rename to docs/archive/pythainlp-dev-thai.md diff --git a/docs/whatsnew-1.7.md b/docs/whatsnew-1.7.md deleted file mode 100644 index 768b6f450..000000000 --- a/docs/whatsnew-1.7.md +++ /dev/null @@ -1,12 +0,0 @@ -# āļĄāļĩāļ­āļ°āđ„āļĢāđƒāļŦāļĄāđˆāđƒāļ™ PyThaiNLP 1.7 - -## āļŠāļĢāļļāļ›āļ›āļĢāļ°āđ€āļ”āđ‡āļ™āļŠāļģāļ„āļąāļ - -- 
āđ€āļĨāļīāļāļŠāļ™āļąāļšāļŠāļ™āļļāļ™ Python 2.7 āļ­āļĒāđˆāļēāļ‡āđ€āļ›āđ‡āļ™āļ—āļēāļ‡āļāļēāļĢ -- āđ€āļžāļīāđˆāļĄ ULMFit utility -- āļ›āļĢāļąāļšāļ›āļĢāļļāļ‡āļĢāļ°āļšāļšāļ•āļąāļ”āļ„āļģāđƒāļŦāļĄāđˆ āļ—āļąāđ‰āļ‡ newmm āđāļĨāļ° mm -- thai2vec 0.2 -- sentiment analysis āļ•āļąāļ§āđƒāļŦāļĄāđˆāļ—āļģāļ‡āļēāļ™āļ”āđ‰āļ§āļĒ deep learning -- āđ€āļžāļīāđˆāļĄ thai2rom āđ€āļ›āđ‡āļ™ Thai romanization āļ—āļģāļ”āđ‰āļ§āļĒ deep learning āđƒāļ™āļĢāļ°āļ”āļąāļšāļ•āļąāļ§āļ­āļąāļāļĐāļĢ - -āļāļģāļĨāļąāļ‡āļ›āļĢāļąāļšāļ›āļĢāļļāļ‡... diff --git a/examples/collate.py b/examples/collate.py deleted file mode 100644 index d4e30525e..000000000 --- a/examples/collate.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.util import collate - -print(collate(["āđ„āļāđˆ", "āđ„āļ‚āđˆ", "āļ", "āļŪāļē"])) # ['āļ', 'āđ„āļāđˆ', 'āđ„āļ‚āđˆ', 'āļŪāļē'] diff --git a/examples/date.py b/examples/date.py deleted file mode 100644 index 888d9c178..000000000 --- a/examples/date.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- - -import datetime -from pythainlp.util import thai_strftime - -fmt = "%Aāļ—āļĩāđˆ %-d %B āļž.āļĻ. %Y āđ€āļ§āļĨāļē %H:%Māļ™. (%a %d-%b-%y)" -date = datetime.datetime(1976, 10, 6, 1, 40) - -# āļ§āļąāļ™āļžāļļāļ˜āļ—āļĩāđˆ 6 āļ•āļļāļĨāļēāļ„āļĄ āļž.āļĻ. 2519 āđ€āļ§āļĨāļē 01:40āļ™. (āļž 06-āļ•.āļ„.-19) -print(thai_strftime(date, fmt)) diff --git a/examples/etcc.py b/examples/etcc.py deleted file mode 100644 index f732fdf11..000000000 --- a/examples/etcc.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.tokenize import etcc - -print(etcc.etcc("āļ„āļ·āļ™āļ„āļ§āļēāļĄāļŠāļļāļ‚")) # /āļ„āļ·āļ™/āļ„āļ§āļēāļĄāļŠāļļāļ‚ diff --git a/examples/ner.py b/examples/ner.py deleted file mode 100644 index 773859e84..000000000 --- a/examples/ner.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.tag.named_entity import ThaiNameTagger -ner = ThaiNameTagger() -print(ner.get_ner("āļ§āļąāļ™āļ—āļĩāđˆ 15 āļ.āļĒ. 
61 āļ—āļ”āļŠāļ­āļšāļĢāļ°āļšāļšāđ€āļ§āļĨāļē 14:49 āļ™.")) diff --git a/examples/normalize.py b/examples/normalize.py deleted file mode 100644 index cac000306..000000000 --- a/examples/normalize.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.util import normalize - -print(normalize("āđ€āđ€āļ›āļĨāļ") == "āđāļ›āļĨāļ") # āđ€ āđ€ āļ› āļĨ āļ āļāļąāļš āđāļ›āļĨāļ diff --git a/examples/soundex.py b/examples/soundex.py deleted file mode 100644 index 9864ac747..000000000 --- a/examples/soundex.py +++ /dev/null @@ -1,16 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.soundex import lk82, metasound, udom83 - -texts = ["āļšāļđāļĢāļ“āļ°", "āļšāļđāļĢāļ“āļāļēāļĢ", "āļĄāļąāļ", "āļĄāļąāļ„", "āļĄāļĢāļĢāļ„", "āļĨāļąāļ", "āļĢāļąāļ", "āļĢāļąāļāļĐāđŒ", ""] -for text in texts: - print( - "{} - lk82: {} - udom83: {} - metasound: {}".format( - text, lk82(text), udom83(text), metasound(text) - ) - ) - -# check equivalence -print(lk82("āļĢāļ–") == lk82("āļĢāļ”")) -print(udom83("āļ§āļĢāļĢ") == udom83("āļ§āļąāļ™")) -print(metasound("āļ™āļž") == metasound("āļ™āļ ")) diff --git a/examples/spell.py b/examples/spell.py deleted file mode 100644 index 92dbc49f3..000000000 --- a/examples/spell.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.corpus import ttc -from pythainlp.spell import spell -from pythainlp.spell.pn import NorvigSpellChecker -from pythainlp.spell.pn import correct as pn_tnc_correct -from pythainlp.spell.pn import spell as pn_tnc_spell - -# spell checker from pythainlp.spell module (generic) -print(spell("āļŠāļĩāđˆāđ€āļŦāļĨāļĩāļĒāļĄ")) # ['āļŠāļĩāđˆāđ€āļŦāļĨāļĩāđˆāļĒāļĄ'] - -# spell checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's) -print(pn_tnc_spell("āđ€āļŦāļĨāļ·āļĒāļĄ")) -print(pn_tnc_correct("āđ€āļŦāļĨāļ·āļĒāļĄ")) - - -# spell checker from pythainlp.spell.pn module (specified algorithm, custom dictionary) -ttc_word_freqs = ttc.word_freqs() -pn_ttc_checker = NorvigSpellChecker(custom_dict=ttc_word_freqs) -print(pn_ttc_checker.spell("āđ€āļŦāļĨāļ·āļĒāļĄ")) -print(pn_ttc_checker.correct("āđ€āļŦāļĨāļ·āļĒāļĄ")) - -# apply different dictionary filter when creating spell checker -pn_tnc_checker = NorvigSpellChecker() -print(len(pn_tnc_checker.dictionary())) -pn_tnc_checker_no_filter = NorvigSpellChecker(dict_filter=None) -print(len(pn_tnc_checker_no_filter.dictionary())) diff --git a/examples/tcc.py b/examples/tcc.py deleted file mode 100644 index 4d95aed43..000000000 --- a/examples/tcc.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.tokenize import tcc - -print(tcc.tcc("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ")) # āļ›/āļĢāļ°/āđ€āļ—/āļĻ/āđ„āļ—/āļĒ - -print(tcc.tcc_pos("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ")) # {1, 3, 5, 6, 8, 9} - -for ch in tcc.tcc_gen("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"): # āļ›-āļĢāļ°-āđ€āļ—-āļĻ-āđ„āļ—-āļĒ- - print(ch, end='-') diff --git a/examples/tokenize.py b/examples/tokenize.py deleted file mode 100644 index 0b8a0d00b..000000000 --- a/examples/tokenize.py +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.tokenize import sent_tokenize, word_tokenize - -text = "āļ‰āļąāļ™āļĢāļąāļāļ āļēāļĐāļēāđ„āļ—āļĒ āđ€āļžāļĢāļēāļ°āļ‰āļąāļ™āđƒāļŠāđ‰āļ āļēāļĐāļēāđ„āļ—āļĒ " -print(text) - -print(sent_tokenize(text)) -# ['āļ‰āļąāļ™āļĢāļąāļāļ āļēāļĐāļēāđ„āļ—āļĒ', 'āđ€āļžāļĢāļēāļ°āļ‰āļąāļ™āđƒāļŠāđ‰āļ āļēāļĐāļēāđ„āļ—āļĒ', ''] - -print(word_tokenize(text)) -# ['āļ‰āļąāļ™', 'āļĢāļąāļ', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', ' ', 'āđ€āļžāļĢāļēāļ°', 'āļ‰āļąāļ™', 'āđƒāļŠāđ‰', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', ' '] 
- -print(word_tokenize(text, whitespaces=False)) -# ['āļ‰āļąāļ™', 'āļĢāļąāļ', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', 'āđ€āļžāļĢāļēāļ°', 'āļ‰āļąāļ™', 'āđƒāļŠāđ‰', 'āļ āļēāļĐāļēāđ„āļ—āļĒ'] - -text2 = "āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™" -print(text2) - -print(word_tokenize(text2)) -# ['āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™'] - -print(word_tokenize(text2, engine="longest")) -# ['āļāļŽāļŦāļĄāļēāļĒ', 'āđāļĢāļ‡āļ‡āļēāļ™'] diff --git a/examples/transliterate.py b/examples/transliterate.py deleted file mode 100644 index 97fb4e7f1..000000000 --- a/examples/transliterate.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -from pythainlp.transliterate import romanize, transliterate - -print(romanize("āđāļĄāļ§")) -print(transliterate("āđāļĄāļ§")) diff --git a/meta.yaml b/meta.yaml index 0bc914207..714ecb262 100644 --- a/meta.yaml +++ b/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "1.7.2" %} +{% set version = "2.0.2" %} package: name: pythainlp diff --git a/notebooks/pythainlp-get-started.ipynb b/notebooks/pythainlp-get-started.ipynb new file mode 100644 index 000000000..806b9e47d --- /dev/null +++ b/notebooks/pythainlp-get-started.ipynb @@ -0,0 +1,1077 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyThaiNLP Get Started\n", + "\n", + "Code examples for basic functions in PyThaiNLP https://github.com/PyThaiNLP/pythainlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Thai Characters\n", + "\n", + "PyThaiNLP provides some ready-to-use Thai character set (e.g. Thai consonants, vowels, tonemarks, symbols) as a string for convenience. There are also few utility functions to test if a string is in Thai or not." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āļāļ‚āļƒāļ„āļ…āļ†āļ‡āļˆāļ‰āļŠāļ‹āļŒāļāļŽāļāļāļ‘āļ’āļ“āļ”āļ•āļ–āļ—āļ˜āļ™āļšāļ›āļœāļāļžāļŸāļ āļĄāļĒāļĢāļĨāļ§āļĻāļĐāļŠāļŦāļŽāļ­āļŪāļĪāļĶāļ°āļąāļēāļģāļīāļĩāļķāļ·āļļāļđāđ€āđāđ‚āđƒāđ„āđ…āđ‡āđˆāđ‰āđŠāđ‹āļŊāđ†āļšāđŒāđāđŽāđāđšāđ›āđāđ‘āđ’āđ“āđ”āđ•āđ–āđ—āđ˜āđ™āļŋ'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pythainlp\n", + "\n", + "pythainlp.thai_characters" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āļāļ‚āļƒāļ„āļ…āļ†āļ‡āļˆāļ‰āļŠāļ‹āļŒāļāļŽāļāļāļ‘āļ’āļ“āļ”āļ•āļ–āļ—āļ˜āļ™āļšāļ›āļœāļāļžāļŸāļ āļĄāļĒāļĢāļĨāļ§āļĻāļĐāļŠāļŦāļŽāļ­āļŪ'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.thai_consonants" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"āđ”\" in pythainlp.thai_digits" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pythainlp.util\n", + "\n", + "pythainlp.util.isthai(\"āļ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.util.isthai(\"(āļ.āļž.)\")" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.util.isthai(\"(āļ.āļž.)\", ignore_chars=\".()\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100.0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.util.countthai(\"āļ§āļąāļ™āļ­āļēāļ—āļīāļ•āļĒāđŒāļ—āļĩāđˆ 24 āļĄāļĩāļ™āļēāļ„āļĄ 2562\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "67.85714285714286" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.util.countthai(\"āļ§āļąāļ™āļ­āļēāļ—āļīāļ•āļĒāđŒāļ—āļĩāđˆ 24 āļĄāļĩāļ™āļēāļ„āļĄ 2562\", ignore_chars=\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Collation\n", + "\n", + "Sorting according to Thai dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['āļāļĢāļĢāđ„āļāļĢ', 'āļāļĢāļ°āļ”āļēāļĐ', 'āđ„āļ‚āđˆ', 'āļ„āđ‰āļ­āļ™', 'āļœāđ‰āļēāđ„āļŦāļĄ']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.util import collate\n", + "\n", + "thai_words = [\"āļ„āđ‰āļ­āļ™\", \"āļāļĢāļ°āļ”āļēāļĐ\", \"āļāļĢāļĢāđ„āļāļĢ\", \"āđ„āļ‚āđˆ\", \"āļœāđ‰āļēāđ„āļŦāļĄ\"]\n", + "collate(thai_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['āļœāđ‰āļēāđ„āļŦāļĄ', 'āļ„āđ‰āļ­āļ™', 'āđ„āļ‚āđˆ', 'āļāļĢāļ°āļ”āļēāļĐ', 'āļāļĢāļĢāđ„āļāļĢ']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collate(thai_words, reverse=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Date and Time Format\n", + "\n", + "Get Thai day and month names with Thai Buddhist Era (B.E.)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āļ§āļąāļ™āļžāļļāļ˜āļ—āļĩāđˆ 6 āļ•āļļāļĨāļēāļ„āļĄ āļž.āļĻ. 2519 āđ€āļ§āļĨāļē 01:40 āļ™. (āļž 06-āļ•.āļ„.-19)'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import datetime\n", + "from pythainlp.util import thai_strftime\n", + "\n", + "fmt = \"%Aāļ—āļĩāđˆ %-d %B āļž.āļĻ. %Y āđ€āļ§āļĨāļē %H:%M āļ™. (%a %d-%b-%y)\"\n", + "date = datetime.datetime(1976, 10, 6, 1, 40)\n", + "\n", + "thai_strftime(date, fmt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokenization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Thai Character Cluster (TCC) and Extended TCC\n", + "\n", + "According to [Character Cluster Based Thai Information Retrieval](https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval) (Theeramunkong et al. 2004)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['āļ›', 'āļĢāļ°', 'āđ€āļ—', 'āļĻ', 'āđ„āļ—', 'āļĒ']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp import tcc\n", + "\n", + "tcc.tcc(\"āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{1, 3, 5, 6, 8, 9}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tcc.tcc_pos(\"āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "āļ›/āļĢāļ°/āđ€āļ—/āļĻ/āđ„āļ—/āļĒ/" + ] + } + ], + "source": [ + "for ch in tcc.tcc_gen(\"āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ\"):\n", + " print(ch, end='/')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sentence and Word\n", + "\n", + "Default word tokenizer (\"newmm\") use maximum matching algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sent_tokenize: ['āļ‰āļąāļ™āļĢāļąāļāļ āļēāļĐāļēāđ„āļ—āļĒ', 'āđ€āļžāļĢāļēāļ°āļ‰āļąāļ™āđƒāļŠāđ‰āļ āļēāļĐāļēāđ„āļ—āļĒ']\n", + "word_tokenize: ['āļ‰āļąāļ™', 'āļĢāļąāļ', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', ' ', 'āđ€āļžāļĢāļēāļ°', 'āļ‰āļąāļ™', 'āđƒāļŠāđ‰', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', ' ']\n", + "word_tokenize, without whitespace: ['āļ‰āļąāļ™', 'āļĢāļąāļ', 'āļ āļēāļĐāļēāđ„āļ—āļĒ', 'āđ€āļžāļĢāļēāļ°', 'āļ‰āļąāļ™', 'āđƒāļŠāđ‰', 'āļ āļēāļĐāļēāđ„āļ—āļĒ']\n" + ] + } + ], + "source": [ + "from pythainlp import sent_tokenize, word_tokenize\n", + "\n", + "text = \"āļ‰āļąāļ™āļĢāļąāļāļ āļēāļĐāļēāđ„āļ—āļĒ āđ€āļžāļĢāļēāļ°āļ‰āļąāļ™āđƒāļŠāđ‰āļ āļēāļĐāļēāđ„āļ—āļĒ \"\n", + "\n", + "print(\"sent_tokenize:\", sent_tokenize(text))\n", + "print(\"word_tokenize:\", word_tokenize(text))\n", + "print(\"word_tokenize, without whitespace:\", word_tokenize(text, whitespaces=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Other algorithm can be chosen. We can also create a tokenizer with custom dictionary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "newmm: ['āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™', 'āļ‰āļšāļąāļš', 'āļ›āļĢāļąāļšāļ›āļĢāļļāļ‡', 'āđƒāļŦāļĄāđˆ', 'āļ›āļĢāļ°āļāļēāļĻ', 'āđƒāļŠāđ‰āđāļĨāđ‰āļ§']\n", + "longest: ['āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™', 'āļ‰āļšāļąāļš', 'āļ›āļĢāļąāļšāļ›āļĢāļļāļ‡', 'āđƒāļŦāļĄāđˆ', 'āļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰', 'āđāļĨāđ‰āļ§']\n", + "custom: ['āļāļŽ', 'āļŦāļĄāļēāļĒāđāļĢāļ‡', 'āļ‡āļēāļ™', 'āļ‰āļšāļąāļšāļ›āļĢāļąāļšāļ›āļĢāļļāļ‡āđƒāļŦāļĄāđˆāļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰āđāļĨāđ‰āļ§']\n" + ] + } + ], + "source": [ + "from pythainlp import word_tokenize, Tokenizer\n", + "\n", + "text = \"āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™āļ‰āļšāļąāļšāļ›āļĢāļąāļšāļ›āļĢāļļāļ‡āđƒāļŦāļĄāđˆāļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰āđāļĨāđ‰āļ§\"\n", + "\n", + "print(\"newmm:\", word_tokenize(text)) # default engine is \"newmm\"\n", + "print(\"longest:\", word_tokenize(text, engine=\"longest\"))\n", + "\n", + "words = [\"āļāļŽ\", \"āļ‡āļēāļ™\"]\n", + "custom_tokenizer = Tokenizer(words)\n", + "print(\"custom:\", custom_tokenizer.word_tokenize(text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Default word tokenizer use a word list from pythainlp.corpus.common.thai_words().\n", + "We can get that list, add/remove words, and create new tokenizer from the modified list." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "newmm: ['āđ„āļ­āđāļ‹āļ„', ' ', 'āļ­āļŠāļī', 'āļĄāļ­', 'āļŸ']\n", + "custom: ['āđ„āļ­āđāļ‹āļ„', ' ', 'āļ­āļŠāļīāļĄāļ­āļŸ']\n" + ] + } + ], + "source": [ + "from pythainlp.corpus.common import thai_words\n", + "from pythainlp import word_tokenize, Tokenizer\n", + "\n", + "text = \"āđ„āļ­āđāļ‹āļ„ āļ­āļŠāļīāļĄāļ­āļŸ\"\n", + "\n", + "print(\"newmm:\", word_tokenize(text))\n", + "\n", + "words = set(thai_words()) # thai_words() returns frozenset\n", + "words.add(\"āļ­āļŠāļīāļĄāļ­āļŸ\")\n", + "custom_tokenizer = Tokenizer(words)\n", + "print(\"custom:\", custom_tokenizer.word_tokenize(text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transliteration" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'maeo'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.transliterate import romanize\n", + "\n", + "romanize(\"āđāļĄāļ§\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mɛːw\n" + ] + } + ], + "source": [ + "from pythainlp.transliterate import transliterate\n", + "\n", + "print(transliterate(\"āđāļĄāļ§\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Normalization" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.util import normalize\n", + "\n", + "normalize(\"āđ€āđ€āļ›āļĨāļ\") == \"āđāļ›āļĨāļ\" # āđ€ āđ€ āļ› āļĨ āļ vs āđāļ›āļĨāļ" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Soundex\n", + "\n", + "\"Soundex is a phonetic algorithm for indexing names 
by sound.\" ([Wikipedia](https://en.wikipedia.org/wiki/Soundex)). PyThaiNLP provides three kinds of Thai soundex." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "from pythainlp.soundex import lk82, metasound, udom83\n", + "\n", + "# check equivalence\n", + "print(lk82(\"āļĢāļ–\") == lk82(\"āļĢāļ”\"))\n", + "print(udom83(\"āļ§āļĢāļĢ\") == udom83(\"āļ§āļąāļ™\"))\n", + "print(metasound(\"āļ™āļž\") == metasound(\"āļ™āļ \"))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "āļšāļđāļĢāļ“āļ° - lk82: āļšE400 - udom83: āļš930000 - metasound: āļš550\n", + "āļšāļđāļĢāļ“āļāļēāļĢ - lk82: āļšE419 - udom83: āļš931900 - metasound: āļš551\n", + "āļĄāļąāļ - lk82: āļĄ1000 - udom83: āļĄ100000 - metasound: āļĄ100\n", + "āļĄāļąāļ„ - lk82: āļĄ1000 - udom83: āļĄ100000 - metasound: āļĄ100\n", + "āļĄāļĢāļĢāļ„ - lk82: āļĄ1000 - udom83: āļĄ310000 - metasound: āļĄ551\n", + "āļĨāļąāļ - lk82: āļĢ1000 - udom83: āļĢ100000 - metasound: āļĨ100\n", + "āļĢāļąāļ - lk82: āļĢ1000 - udom83: āļĢ100000 - metasound: āļĢ100\n", + "āļĢāļąāļāļĐāđŒ - lk82: āļĢ1000 - udom83: āļĢ100000 - metasound: āļĢ100\n", + " - lk82: - udom83: - metasound: \n" + ] + } + ], + "source": [ + "texts = [\"āļšāļđāļĢāļ“āļ°\", \"āļšāļđāļĢāļ“āļāļēāļĢ\", \"āļĄāļąāļ\", \"āļĄāļąāļ„\", \"āļĄāļĢāļĢāļ„\", \"āļĨāļąāļ\", \"āļĢāļąāļ\", \"āļĢāļąāļāļĐāđŒ\", \"\"]\n", + "for text in texts:\n", + " print(\n", + " \"{} - lk82: {} - udom83: {} - metasound: {}\".format(\n", + " text, lk82(text), udom83(text), metasound(text)\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spellchecking\n", + "\n", + "Default spellchecker uses [Peter Norvig's algorithm](http://www.norvig.com/spell-correct.html) together with word frequency from Thai National Corpus (TNC)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['āđ€āļŦāļĨāļĩāļĒāļĄ', 'āđ€āļŦāļĨāļ·āļ­āļĄ']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp import spell\n", + "\n", + "# list possible spellings\n", + "spell(\"āđ€āļŦāļĨāļ·āļĒāļĄ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āđ€āļŦāļĨāļĩāļĒāļĄ'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp import correct\n", + "\n", + "# choose the most likely spelling\n", + "correct(\"āđ€āļŦāļĨāļ·āļĒāļĄ\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spellchecking - Custom dictionary and word frequency\n", + "\n", + "Custom dictionary can be provided when creating spellchecker." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['āđ€āļŦāļĨāļ·āļ­āļĄ']\n", + "āđ€āļŦāļĨāļ·āļ­āļĄ\n" + ] + } + ], + "source": [ + "from pythainlp.corpus import ttc # Thai Textbook Corpus\n", + "from pythainlp.spell import NorvigSpellChecker\n", + "\n", + "checker = NorvigSpellChecker(custom_dict=ttc.word_freqs())\n", + "print(checker.spell(\"āđ€āļŦāļĨāļ·āļĒāļĄ\"))\n", + "print(checker.correct(\"āđ€āļŦāļĨāļ·āļĒāļĄ\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('āļˆāļ°', 51681),\n", + " ('āđ€āļ›āđ‡āļ™', 51273),\n", + " ('āđ„āļ›', 46567),\n", + " ('āļāđ‡', 46409),\n", + " ('āđ„āļĄāđˆ', 45895),\n", + " ('āļĄāļĩ', 44899),\n", + " ('āđ„āļ”āđ‰', 44513),\n", + " ('āļ§āđˆāļē', 40290),\n", + " ('āđƒāļŦāđ‰', 38715)]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(checker.dictionary())[1:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also apply conditions and filter function to dictionary when creating spellchecker." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "39977" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "checker = NorvigSpellChecker() # use default filter (remove any word with number or non-Thai character)\n", + "len(checker.dictionary())" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "30379" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "checker = NorvigSpellChecker(min_freq=5, min_len=2, max_len=15)\n", + "len(checker.dictionary())" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "76706" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "checker_no_filter = NorvigSpellChecker(dict_filter=None) # use no filter\n", + "len(checker_no_filter.dictionary())" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "76700" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def remove_yamok(word):\n", + " return False if \"āđ†\" in word else True\n", + "\n", + "checker_custom_filter = NorvigSpellChecker(dict_filter=remove_yamok) # use custom filter\n", + "len(checker_custom_filter.dictionary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part-of-Speech Tagging" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('āļāļēāļĢ', 'FIXN'), ('āđ€āļ”āļīāļ™āļ—āļēāļ‡', 'VACT')]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.tag import pos_tag, pos_tag_sents\n", + "\n", + "pos_tag([\"āļāļēāļĢ\",\"āđ€āļ”āļīāļ™āļ—āļēāļ‡\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[('āļĢāļēāļŠāļāļīāļˆāļˆāļēāļ™āļļāđ€āļšāļāļĐāļē', 'NCMN'),\n", + " 
('āđ€āļœāļĒāđāļžāļĢāđˆ', 'VACT'),\n", + " ('āļ›āļĢāļ°āļāļēāļĻāļŠāļģāļ™āļąāļāļ™āļēāļĒāļāļŊ', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('āđƒāļŦāđ‰', 'JSBR'),\n", + " (' ', 'PUNC'),\n", + " (\"'āļžāļĨ.āļ—.āļŠāļĢāļĢāđ€āļŠāļĢāļīāļ āđāļāđ‰āļ§āļāļģāđ€āļ™āļīāļ”'\", 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('āļžāđ‰āļ™āļˆāļēāļāļ•āļģāđāļŦāļ™āđˆāļ‡', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('āļœāļđāđ‰āļ—āļĢāļ‡āļ„āļļāļ“āļ§āļļāļ’āļīāļžāļīāđ€āļĻāļĐ', 'NCMN'),\n", + " ('āļāļ­āļ‡āļ—āļąāļžāļšāļ', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " ('āļāļĢāļ°āļ—āļĢāļ§āļ‡āļāļĨāļēāđ‚āļŦāļĄ', 'NCMN')],\n", + " [('āđāļĨāļ°', 'JCRG'),\n", + " ('āđāļ•āđˆāļ‡āļ•āļąāđ‰āļ‡', 'VACT'),\n", + " ('āđƒāļŦāđ‰', 'JSBR'),\n", + " ('āđ€āļ›āđ‡āļ™', 'VSTA'),\n", + " ('āļ‚āđ‰āļēāļĢāļēāļŠāļāļēāļĢ', 'NCMN'),\n", + " ('āļžāļĨāđ€āļĢāļ·āļ­āļ™', 'NCMN'),\n", + " ('āļŠāļēāļĄāļąāļ', 'NCMN'),\n", + " ('āļ•āļģāđāļŦāļ™āđˆāļ‡', 'NCMN'),\n", + " (' ', 'PUNC'),\n", + " (\"'āļ­āļ˜āļīāļšāļ”āļĩāļāļĢāļĄāļ›āļĢāļ°āļŠāļēāļŠāļąāļĄāļžāļąāļ™āļ˜āđŒ'\", 'NCMN')]]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sents = [[\"āļĢāļēāļŠāļāļīāļˆāļˆāļēāļ™āļļāđ€āļšāļāļĐāļē\", \"āđ€āļœāļĒāđāļžāļĢāđˆ\", \"āļ›āļĢāļ°āļāļēāļĻāļŠāļģāļ™āļąāļāļ™āļēāļĒāļāļŊ\", \" \", \"āđƒāļŦāđ‰\",\n", + " \" \", \"'āļžāļĨ.āļ—.āļŠāļĢāļĢāđ€āļŠāļĢāļīāļ āđāļāđ‰āļ§āļāļģāđ€āļ™āļīāļ”'\", \" \", \"āļžāđ‰āļ™āļˆāļēāļāļ•āļģāđāļŦāļ™āđˆāļ‡\",\n", + " \" \", \"āļœāļđāđ‰āļ—āļĢāļ‡āļ„āļļāļ“āļ§āļļāļ’āļīāļžāļīāđ€āļĻāļĐ\", \"āļāļ­āļ‡āļ—āļąāļžāļšāļ\", \" \", \"āļāļĢāļ°āļ—āļĢāļ§āļ‡āļāļĨāļēāđ‚āļŦāļĄ\"],\n", + " [\"āđāļĨāļ°\",\"āđāļ•āđˆāļ‡āļ•āļąāđ‰āļ‡\",\"āđƒāļŦāđ‰\", \"āđ€āļ›āđ‡āļ™\", \"āļ‚āđ‰āļēāļĢāļēāļŠāļāļēāļĢ\", \"āļžāļĨāđ€āļĢāļ·āļ­āļ™\", \"āļŠāļēāļĄāļąāļ\",\n", + " \"āļ•āļģāđāļŦāļ™āđˆāļ‡\", \" \", \"'āļ­āļ˜āļīāļšāļ”āļĩāļāļĢāļĄāļ›āļĢāļ°āļŠāļēāļŠāļąāļĄāļžāļąāļ™āļ˜āđŒ'\"]]\n", + "\n", + "pos_tag_sents(sents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Named-Entity Tagging\n", + "\n", + "The tagger use BIO scheme:\n", + "- B - beginning of entity\n", + "- I - inside entity\n", + "- O - outside entity" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('āļ§āļąāļ™āļ—āļĩāđˆ', 'NOUN', 'O'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('15', 'NUM', 'B-DATE'),\n", + " (' ', 'PUNCT', 'I-DATE'),\n", + " ('āļ.āļĒ.', 'NOUN', 'I-DATE'),\n", + " (' ', 'PUNCT', 'I-DATE'),\n", + " ('61', 'NUM', 'I-DATE'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('āļ—āļ”āļŠāļ­āļš', 'VERB', 'O'),\n", + " ('āļĢāļ°āļšāļš', 'NOUN', 'O'),\n", + " ('āđ€āļ§āļĨāļē', 'NOUN', 'O'),\n", + " (' ', 'PUNCT', 'O'),\n", + " ('14', 'NOUN', 'B-TIME'),\n", + " (':', 'PUNCT', 'I-TIME'),\n", + " ('49', 'NUM', 'I-TIME'),\n", + " (' ', 'PUNCT', 'I-TIME'),\n", + " ('āļ™.', 'NOUN', 'I-TIME')]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.tag.named_entity import ThaiNameTagger\n", + "\n", + "ner = ThaiNameTagger()\n", + "ner.get_ner(\"āļ§āļąāļ™āļ—āļĩāđˆ 15 āļ.āļĒ. 
61 āļ—āļ”āļŠāļ­āļšāļĢāļ°āļšāļšāđ€āļ§āļĨāļē 14:49 āļ™.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Word Vector" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English\n", + "INFO:gensim.models.utils_any2vec:loading projection weights from /Users/arthit/pythainlp-data/thai2vec.bin\n", + "INFO:gensim.models.utils_any2vec:loaded (60001, 400) matrix from /Users/arthit/pythainlp-data/thai2vec.bin\n", + "/usr/local/lib/python3.7/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + }, + { + "data": { + "text/plain": [ + "0.99259853" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pythainlp.word_vector\n", + "\n", + "pythainlp.word_vector.similarity(\"āļ„āļ™\", \"āļĄāļ™āļļāļĐāļĒāđŒ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors\n" + ] + }, + { + "data": { + "text/plain": [ + "'āđāļĄāļ§'" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythainlp.word_vector.doesnt_match([\"āļ„āļ™\", \"āļĄāļ™āļļāļĐāļĒāđŒ\", \"āļšāļļāļ„āļ„āļĨ\", \"āđ€āļˆāđ‰āļēāļŦāļ™āđ‰āļēāļ—āļĩāđˆ\", \"āđāļĄāļ§\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Number Spell Out" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āļŦāļ™āļķāđˆāļ‡āļĨāđ‰āļēāļ™āļŠāļ­āļ‡āđāļŠāļ™āļŠāļēāļĄāļŦāļĄāļ·āđˆāļ™āļŠāļĩāđˆāļžāļąāļ™āļŦāđ‰āļēāļĢāđ‰āļ­āļĒāļŦāļāļŠāļīāļšāđ€āļˆāđ‡āļ”āļĨāđ‰āļēāļ™āđāļ›āļ”āđāļŠāļ™āđ€āļāđ‰āļēāļŦāļĄāļ·āđˆāļ™āļŦāļ™āļķāđˆāļ‡āļĢāđ‰āļ­āļĒāļĒāļĩāđˆāļŠāļīāļšāļŠāļēāļĄāļšāļēāļ—āļŠāļĩāđˆāļŠāļīāļšāļŦāđ‰āļēāļŠāļ•āļēāļ‡āļ„āđŒ'" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pythainlp.util import bahttext\n", + "\n", + "bahttext(1234567890123.45)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'āļŦāļ™āļķāđˆāļ‡āļšāļēāļ—āđ€āļāđ‰āļēāļŠāļīāļšāđ€āļ­āđ‡āļ”āļŠāļ•āļēāļ‡āļ„āđŒ'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bahttext(1.909)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/sentiment_analysis.ipynb b/notebooks/sentiment_analysis.ipynb index 58b659687..a1ab56694 100644 --- a/notebooks/sentiment_analysis.ipynb +++ 
b/notebooks/sentiment_analysis.ipynb @@ -47,12 +47,14 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", + "import re\n", + "\n", + "import emoji\n", "import numpy as np\n", + "import pandas as pd\n", + "\n", "from pythainlp import word_tokenize\n", "from tqdm import tqdm_notebook\n", - "import re\n", - "import emoji\n", "\n", "#viz\n", "import matplotlib.pyplot as plt\n", @@ -79,8 +81,8 @@ "def replace_rep(text):\n", " def _replace_rep(m):\n", " c,cc = m.groups()\n", - " return f'{c}xxrep'\n", - " re_rep = re.compile(r'(\\S)(\\1{2,})')\n", + " return f\"{c}xxrep\"\n", + " re_rep = re.compile(r\"(\\S)(\\1{2,})\")\n", " return re_rep.sub(_replace_rep, text)\n", "\n", "def ungroup_emoji(toks):\n", @@ -100,7 +102,7 @@ " res = replace_rep(res)\n", " \n", " #tokenize\n", - " res = [word for word in word_tokenize(res, engine='ulmfit') if word and not re.search(pattern=r\"\\s+\", string=word)]\n", + " res = [word for word in word_tokenize(res, engine=\"ulmfit\") if word and not re.search(pattern=r\"\\s+\", string=word)]\n", " \n", " #post rules\n", " res = ungroup_emoji(res)\n", @@ -123,15 +125,13 @@ }, "outputs": [], "source": [ - "with open('train.txt') as f:\n", + "with open(\"train.txt\") as f:\n", " texts = [line.strip() for line in f.readlines()]\n", - "f.close()\n", "\n", - "with open('train_label.txt') as f:\n", + "with open(\"train_label.txt\") as f:\n", " categories = [line.strip() for line in f.readlines()]\n", - "f.close()\n", "\n", - "all_df = pd.DataFrame({'category':categories, 'texts':texts})\n", + "all_df = pd.DataFrame({\"category\":categories, \"texts\":texts})\n", "all_df.shape" ] }, @@ -141,11 +141,10 @@ "metadata": {}, "outputs": [], "source": [ - "with open('test.txt') as f:\n", + "with open(\"test.txt\") as f:\n", " texts = [line.strip() for line in f.readlines()]\n", - "f.close()\n", "\n", - "test_df = pd.DataFrame({'category':'test', 'texts':texts})\n", + "test_df = pd.DataFrame({\"category\":\"test\", \"texts\":texts})\n", "test_df.shape" ] }, @@ -162,16 +161,16 @@ "metadata": {}, "outputs": [], "source": [ - "all_df = pd.read_csv('all_df.csv')\n", - "test_df = pd.read_csv('test_df.csv')\n", + "all_df = pd.read_csv(\"all_df.csv\")\n", + "test_df = pd.read_csv(\"test_df.csv\")\n", "\n", - "all_df['processed'] = all_df.texts.map(lambda x: '|'.join(process_text(x)))\n", - "all_df['wc'] = all_df.processed.map(lambda x: len(x.split('|')))\n", - "all_df['uwc'] = all_df.processed.map(lambda x: len(set(x.split('|'))))\n", + "all_df[\"processed\"] = all_df.texts.map(lambda x: \"|\".join(process_text(x)))\n", + "all_df[\"wc\"] = all_df.processed.map(lambda x: len(x.split(\"|\")))\n", + "all_df[\"uwc\"] = all_df.processed.map(lambda x: len(set(x.split(\"|\"))))\n", "\n", - "test_df['processed'] = test_df.texts.map(lambda x: '|'.join(process_text(x)))\n", - "test_df['wc'] = test_df.processed.map(lambda x: len(x.split('|')))\n", - "test_df['uwc'] = test_df.processed.map(lambda x: len(set(x.split('|'))))" + "test_df[\"processed\"] = test_df.texts.map(lambda x: \"|\".join(process_text(x)))\n", + "test_df[\"wc\"] = test_df.processed.map(lambda x: len(x.split(\"|\")))\n", + "test_df[\"uwc\"] = test_df.processed.map(lambda x: len(set(x.split(\"|\"))))" ] }, { @@ -352,7 +351,7 @@ ], "source": [ "#prevalence\n", - "print(train_df['category'].value_counts() / train_df.shape[0])" + "print(train_df[\"category\"].value_counts() / train_df.shape[0])" ] }, { @@ -374,7 +373,7 @@ ], "source": [ "#prevalence\n", - "print(valid_df['category'].value_counts() / 
valid_df.shape[0])" + "print(valid_df[\"category\"].value_counts() / valid_df.shape[0])" ] }, { @@ -398,8 +397,8 @@ "outputs": [], "source": [ "#dependent variables\n", - "y_train = train_df['category']\n", - "y_valid = valid_df['category']" + "y_train = train_df[\"category\"]\n", + "y_valid = valid_df[\"category\"]" ] }, { @@ -424,10 +423,10 @@ "from sklearn.linear_model import LogisticRegression\n", "\n", "tfidf = TfidfVectorizer(tokenizer=process_text, ngram_range=(1,2), min_df=20, sublinear_tf=True)\n", - "tfidf_fit = tfidf.fit(all_df['texts'])\n", - "text_train = tfidf_fit.transform(train_df['texts'])\n", - "text_valid = tfidf_fit.transform(valid_df['texts'])\n", - "text_test = tfidf_fit.transform(test_df['texts'])\n", + "tfidf_fit = tfidf.fit(all_df[\"texts\"])\n", + "text_train = tfidf_fit.transform(train_df[\"texts\"])\n", + "text_valid = tfidf_fit.transform(valid_df[\"texts\"])\n", + "text_test = tfidf_fit.transform(test_df[\"texts\"])\n", "text_train.shape, text_valid.shape" ] }, @@ -459,11 +458,11 @@ "from sklearn.preprocessing import StandardScaler\n", "\n", "scaler = StandardScaler()\n", - "scaler_fit = scaler.fit(all_df[['wc','uwc']].astype(float))\n", + "scaler_fit = scaler.fit(all_df[[\"wc\",\"uwc\"]].astype(float))\n", "print(scaler_fit.mean_, scaler_fit.var_)\n", - "num_train = scaler_fit.transform(train_df[['wc','uwc']].astype(float))\n", - "num_valid = scaler_fit.transform(valid_df[['wc','uwc']].astype(float))\n", - "num_test = scaler_fit.transform(test_df[['wc','uwc']].astype(float))\n", + "num_train = scaler_fit.transform(train_df[[\"wc\",\"uwc\"]].astype(float))\n", + "num_valid = scaler_fit.transform(valid_df[[\"wc\",\"uwc\"]].astype(float))\n", + "num_test = scaler_fit.transform(test_df[[\"wc\",\"uwc\"]].astype(float))\n", "num_train.shape, num_valid.shape" ] }, @@ -516,7 +515,7 @@ ], "source": [ "#fit logistic regression models\n", - "model = LogisticRegression(C=2., penalty='l2', solver='liblinear', dual=False, multi_class='ovr')\n", + "model = LogisticRegression(C=2., penalty=\"l2\", solver=\"liblinear\", dual=False, multi_class=\"ovr\")\n", "model.fit(X_train,y_train)\n", "model.score(X_valid,y_valid)" ] @@ -537,14 +536,14 @@ "probs = model.predict_proba(X_valid)\n", "probs_df = pd.DataFrame(probs)\n", "probs_df.columns = model.classes_\n", - "probs_df['preds'] = model.predict(X_valid)\n", - "probs_df['category'] = valid_df.category\n", - "probs_df['texts'] = valid_df.texts\n", - "probs_df['processed'] = valid_df.processed\n", - "probs_df['wc'] = valid_df.wc\n", - "probs_df['uwc'] = valid_df.uwc\n", - "probs_df['hit'] = (probs_df.preds==probs_df.category)\n", - "probs_df.to_csv('probs_df_linear.csv',index=False)" + "probs_df[\"preds\"] = model.predict(X_valid)\n", + "probs_df[\"category\"] = valid_df.category\n", + "probs_df[\"texts\"] = valid_df.texts\n", + "probs_df[\"processed\"] = valid_df.processed\n", + "probs_df[\"wc\"] = valid_df.wc\n", + "probs_df[\"uwc\"] = valid_df.uwc\n", + "probs_df[\"hit\"] = (probs_df.preds==probs_df.category)\n", + "probs_df.to_csv(\"probs_df_linear.csv\", index=False)" ] }, { @@ -577,10 +576,10 @@ "\n", "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", "print(model.score(X_valid,y_valid))\n", - "sns.heatmap(conf_mat, annot=True, fmt='d',\n", + "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", " xticklabels=model.classes_, yticklabels=model.classes_)\n", - "plt.ylabel('Actual')\n", - "plt.xlabel('Predicted')\n", + "plt.ylabel(\"Actual\")\n", + "plt.xlabel(\"Predicted\")\n", "plt.show()" ] }, @@ -601,8 +600,8 @@ 
"from fastai.callbacks import CSVLogger, SaveModelCallback\n", "from pythainlp.ulmfit import *\n", "\n", - "model_path = 'wisesight_data/'\n", - "all_df = pd.read_csv('all_df.csv')\n", + "model_path = \"wisesight_data/\"\n", + "all_df = pd.read_csv(\"all_df.csv\")\n", "train_df, valid_df = train_test_split(all_df, test_size=0.15, random_state=1412)" ] }, @@ -619,11 +618,11 @@ "metadata": {}, "outputs": [], "source": [ - "tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)\n", + "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", " NumericalizeProcessor(vocab=None, max_vocab=60000, min_freq=2)]\n", "\n", - "data_lm = (TextList.from_df(all_df, model_path, cols='texts', processor=processor)\n", + "data_lm = (TextList.from_df(all_df, model_path, cols=\"texts\", processor=processor)\n", " .random_split_by_pct(valid_pct = 0.01, seed = 1412)\n", " .label_for_lm()\n", " .databunch(bs=48))\n", @@ -708,7 +707,7 @@ ], "source": [ "#train frozen\n", - "print('training frozen')\n", + "print(\"training frozen\")\n", "learn.freeze_to(-1)\n", "learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))" ] @@ -777,7 +776,7 @@ ], "source": [ "#train unfrozen\n", - "print('training unfrozen')\n", + "print(\"training unfrozen\")\n", "learn.unfreeze()\n", "learn.fit_one_cycle(5, 1e-3, moms=(0.8, 0.7))" ] @@ -789,7 +788,7 @@ "outputs": [], "source": [ "# learn.save('wisesight_lm')\n", - "learn.save_encoder('wisesight_enc')" + "learn.save_encoder(\"wisesight_enc\")" ] }, { @@ -814,17 +813,17 @@ ], "source": [ "#lm data\n", - "data_lm = load_data(model_path,'wisesight_lm.pkl')\n", + "data_lm = load_data(model_path, \"wisesight_lm.pkl\")\n", "data_lm.sanity_check()\n", "\n", "#classification data\n", - "tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th)\n", + "tt = Tokenizer(tok_func=ThaiTokenizer, lang=\"th\", pre_rules=pre_rules_th, post_rules=post_rules_th)\n", "processor = [TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),\n", " NumericalizeProcessor(vocab=data_lm.vocab, max_vocab=60000, min_freq=20)]\n", "\n", - "data_cls = (ItemLists(model_path,train=TextList.from_df(train_df, model_path, cols=['texts'], processor=processor),\n", - " valid=TextList.from_df(valid_df, model_path, cols=['texts'], processor=processor))\n", - " .label_from_df('category')\n", + "data_cls = (ItemLists(model_path,train=TextList.from_df(train_df, model_path, cols=[\"texts\"], processor=processor),\n", + " valid=TextList.from_df(valid_df, model_path, cols=[\"texts\"], processor=processor))\n", + " .label_from_df(\"category\")\n", " .databunch(bs=50)\n", " )\n", "data_cls.sanity_check()\n", @@ -844,7 +843,7 @@ "\n", "learn = text_classifier_learner(data_cls, AWD_LSTM, config=config, pretrained=False, **trn_args)\n", "#load pretrained finetuned model\n", - "learn.load_encoder('wisesight_enc')" + "learn.load_encoder(\"wisesight_enc\")" ] }, { @@ -909,7 +908,8 @@ "metadata": {}, "outputs": [], "source": [ - "learn.load('bestmodel');\n", + "learn.load(\"bestmodel\")\n", + "\n", "#get predictions\n", "probs, y_true, loss = learn.get_preds(ds_type = DatasetType.Valid, ordered=True, with_loss=True)\n", "classes = learn.data.train_ds.classes\n", @@ -938,9 +938,9 @@ "source": [ "to_df = np.concatenate([y_true[:,None],preds[:,None],loss[:,None],prob],1)\n", "probs_df = pd.DataFrame(to_df)\n", - 
"probs_df.columns = ['category','preds','loss'] + classes\n", - "probs_df['hit'] = (probs_df.category == probs_df.preds)\n", - "probs_df['texts'] = valid_df.texts\n", + "probs_df.columns = [\"category\",\"preds\",\"loss\"] + classes\n", + "probs_df[\"hit\"] = (probs_df.category == probs_df.preds)\n", + "probs_df[\"texts\"] = valid_df.texts\n", "(y_true==preds).mean()" ] }, @@ -967,10 +967,10 @@ "import seaborn as sns\n", "\n", "conf_mat = confusion_matrix(probs_df.category,probs_df.preds)\n", - "sns.heatmap(conf_mat, annot=True, fmt='d',\n", + "sns.heatmap(conf_mat, annot=True, fmt=\"d\",\n", " xticklabels=classes, yticklabels=classes)\n", - "plt.ylabel('Actual')\n", - "plt.xlabel('Predicted')\n", + "plt.ylabel(\"Actual\")\n", + "plt.xlabel(\"Predicted\")\n", "plt.show()" ] } @@ -991,7 +991,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 7d873a9a1..21a18a9c1 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -1,6 +1,6 @@ ïŧŋ# -*- coding: utf-8 -*- -__version__ = 2.0 +__version__ = "2.0.2" thai_consonants = "āļāļ‚āļƒāļ„āļ…āļ†āļ‡āļˆāļ‰āļŠāļ‹āļŒāļāļŽāļāļāļ‘āļ’āļ“āļ”āļ•āļ–āļ—āļ˜āļ™āļšāļ›āļœāļāļžāļŸāļ āļĄāļĒāļĢāļĨāļ§āļĻāļĐāļŠāļŦāļŽāļ­āļŪ" # 44 chars thai_vowels = "āļĪāļĶāļ°\u0e31āļēāļģ\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39āđ€āđāđ‚āđƒāđ„\u0e45\u0e47" # 19 @@ -25,8 +25,8 @@ from pythainlp.soundex import soundex -from pythainlp.spell import spell +from pythainlp.spell import correct, spell from pythainlp.tag import pos_tag -from pythainlp.tokenize import sent_tokenize, tcc, word_tokenize +from pythainlp.tokenize import sent_tokenize, tcc, word_tokenize, Tokenizer from pythainlp.transliterate import romanize, transliterate from pythainlp.util import collate, thai_strftime diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index f41744dac..855215bc0 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -113,7 +113,7 @@ def download(name: str, force: bool = False): data_json = data.json() if name in list(data_json.keys()): temp_name = data_json[name] - print("Download : " + name) + print("Download: " + name) if not db.search(temp.name == name): print(name + " " + temp_name["version"]) diff --git a/pythainlp/soundex/__init__.py b/pythainlp/soundex/__init__.py index 30cfcd0a7..fac5f978d 100644 --- a/pythainlp/soundex/__init__.py +++ b/pythainlp/soundex/__init__.py @@ -12,7 +12,7 @@ # [KSS97] https://linux.thai.net/~thep/soundex/soundex.html -def soundex(text, engine="udom83"): +def soundex(text: str, engine="udom83") -> str: """ Thai Soundex diff --git a/pythainlp/soundex/lk82.py b/pythainlp/soundex/lk82.py index f7b21a764..e0dee6d6b 100644 --- a/pythainlp/soundex/lk82.py +++ b/pythainlp/soundex/lk82.py @@ -21,7 +21,7 @@ _RE_3 = re.compile(r"[āđ‡āđāļšāđ†āļŊ]") -def lk82(text): +def lk82(text: str) -> str: """ LK82 - It's a Thai soundex rule. 
diff --git a/pythainlp/soundex/metasound.py b/pythainlp/soundex/metasound.py index c5f7f8233..6998f81a9 100644 --- a/pythainlp/soundex/metasound.py +++ b/pythainlp/soundex/metasound.py @@ -20,7 +20,7 @@ _C8 = "āļ§" # W -> 8 -def metasound(text, length=4): +def metasound(text: str, length: int = 4) -> str: """ Thai MetaSound diff --git a/pythainlp/soundex/udom83.py b/pythainlp/soundex/udom83.py index bf7ec5bba..dce60feaa 100644 --- a/pythainlp/soundex/udom83.py +++ b/pythainlp/soundex/udom83.py @@ -29,7 +29,7 @@ ) -def udom83(text): +def udom83(text: str) -> str: """ Udom83 - It's a Thai soundex rule. diff --git a/pythainlp/spell/__init__.py b/pythainlp/spell/__init__.py index cfd06682b..c4b654f53 100644 --- a/pythainlp/spell/__init__.py +++ b/pythainlp/spell/__init__.py @@ -3,11 +3,14 @@ Spell checking """ -from .pn import correct as pn_correct -from .pn import spell as pn_spell +from typing import List +from .pn import DEFAULT_SPELL_CHECKER, NorvigSpellChecker -def spell(word, engine="pn"): +__all__ = ["DEFAULT_SPELL_CHECKER", "correct", "spell", "NorvigSpellChecker"] + + +def spell(word: str, engine="pn") -> List[str]: """ :param str word: word to check spelling :param str engine: @@ -15,10 +18,10 @@ def spell(word, engine="pn"): :return: list of words """ - return pn_spell(word) + return DEFAULT_SPELL_CHECKER.spell(word) -def correct(word, engine="pn"): +def correct(word: str, engine="pn") -> str: """ :param str word: word to correct spelling :param str engine: @@ -26,4 +29,4 @@ def correct(word, engine="pn"): :return: the corrected word """ - return pn_correct(word) + return DEFAULT_SPELL_CHECKER.correct(word) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 84def66f3..ddce3d5c7 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -7,26 +7,33 @@ Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html """ from collections import Counter +from typing import Callable, List, Set, Tuple from pythainlp import thai_letters from pythainlp.corpus import tnc -from pythainlp.util import is_thaichar +from pythainlp.util import isthaichar -def _no_filter(word): +def _no_filter(word: str) -> bool: return True -def _is_thai_and_not_num(word): +def _is_thai_and_not_num(word: str) -> bool: for ch in word: - if ch != "." and not is_thaichar(ch): + if ch != "." 
and not isthaichar(ch): return False if ch in "āđāđ‘āđ’āđ“āđ”āđ•āđ–āđ—āđ˜āđ™0123456789": return False return True -def _keep(word_freq, min_freq, min_len, max_len, dict_filter): +def _keep( + word_freq: int, + min_freq: int, + min_len: int, + max_len: int, + dict_filter: Callable[[str], bool], +): """ Keep only Thai words with at least min_freq frequency and has length between min_len and max_len characters @@ -41,7 +48,7 @@ def _keep(word_freq, min_freq, min_len, max_len, dict_filter): return dict_filter(word) -def _edits1(word): +def _edits1(word: str) -> Set[str]: """ Return a set of words with edit distance of 1 from the input word """ @@ -54,7 +61,7 @@ def _edits1(word): return set(deletes + transposes + replaces + inserts) -def _edits2(word): +def _edits2(word: str) -> Set[str]: """ Return a set of words with edit distance of 2 from the input word """ @@ -64,11 +71,11 @@ def _edits2(word): class NorvigSpellChecker: def __init__( self, - custom_dict=None, - min_freq=2, - min_len=2, - max_len=40, - dict_filter=_is_thai_and_not_num, + custom_dict: List[Tuple[str, int]] = None, + min_freq: int = 2, + min_len: int = 2, + max_len: int = 40, + dict_filter: Callable[[str], bool] = _is_thai_and_not_num, ): """ Initialize Peter Norvig's spell checker object @@ -97,13 +104,13 @@ def __init__( if self.__WORDS_TOTAL < 1: self.__WORDS_TOTAL = 0 - def dictionary(self): + def dictionary(self) -> List[Tuple[str, int]]: """ Return the spelling dictionary currently used by this spell checker """ return self.__WORDS.items() - def known(self, words): + def known(self, words: List[str]) -> List[str]: """ Return a list of given words that found in the spelling dictionary @@ -111,7 +118,7 @@ def known(self, words): """ return list(w for w in words if w in self.__WORDS) - def prob(self, word): + def prob(self, word: str) -> float: """ Return probability of an input word, according to the spelling dictionary @@ -119,7 +126,7 @@ def prob(self, word): """ return self.__WORDS[word] / self.__WORDS_TOTAL - def freq(self, word): + def freq(self, word: str) -> int: """ Return frequency of an input word, according to the spelling dictionary @@ -127,7 +134,7 @@ def freq(self, word): """ return self.__WORDS[word] - def spell(self, word): + def spell(self, word: str) -> List[str]: """ Return a list of possible words, according to edit distance of 1 and 2, sorted by frequency of word occurrance in the spelling dictionary @@ -147,7 +154,7 @@ def spell(self, word): return candidates - def correct(self, word): + def correct(self, word: str) -> str: """ Return the most possible word, using the probability from the spelling dictionary @@ -160,49 +167,3 @@ def correct(self, word): DEFAULT_SPELL_CHECKER = NorvigSpellChecker() - - -def dictionary(): - """ - Return the spelling dictionary currently used by this spell checker. - The spelling dictionary is based on words found in the Thai National Corpus. - """ - return DEFAULT_SPELL_CHECKER.dictionary() - - -def known(words): - """ - Return a list of given words that found in the spelling dictionary. - The spelling dictionary is based on words found in the Thai National Corpus. 
- - :param str words: A list of words to check if they are in the spelling dictionary - """ - return DEFAULT_SPELL_CHECKER.known(words) - - -def prob(word): - """ - Return probability of an input word, according to the Thai National Corpus - - :param str word: A word to check its probability of occurrence - """ - return DEFAULT_SPELL_CHECKER.prob(word) - - -def spell(word): - """ - Return a list of possible words, according to edit distance of 1 and 2, - sorted by probability of word occurrance in the Thai National Corpus. - - :param str word: A word to check its spelling - """ - return DEFAULT_SPELL_CHECKER.spell(word) - - -def correct(word): - """ - Return the most possible word, according to probability from the Thai National Corpus - - :param str word: A word to correct its spelling - """ - return DEFAULT_SPELL_CHECKER.correct(word) diff --git a/pythainlp/summarize/freq.py b/pythainlp/summarize/freq.py index c7bc25ff9..2dc7044fd 100644 --- a/pythainlp/summarize/freq.py +++ b/pythainlp/summarize/freq.py @@ -33,10 +33,10 @@ def __compute_frequencies(self, word_tokenized_sents): return word_freqs - def __rank(self, ranking, n): + def __rank(self, ranking, n: int): return nlargest(n, ranking, key=ranking.get) - def summarize(self, text, n, tokenizer): + def summarize(self, text: str, n: int, tokenizer: str): sents = sent_tokenize(text) word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents] self.__freq = self.__compute_frequencies(word_tokenized_sents) diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 9b0232b78..6f788aaf0 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -4,31 +4,29 @@ such as its part of speech and class of named-entity. """ -__all__ = [ - "pos_tag", - "pos_tag_sents", - "tag_provinces" -] +from typing import List, Tuple + +__all__ = ["pos_tag", "pos_tag_sents", "tag_provinces"] from .locations import tag_provinces # tag map for orchid to Universal Dependencies -# from Korakot Chaovavanich +# from Korakot Chaovavanich _TAG_MAP_UD = { - #NOUN - "NOUN":"NOUN", - "NCMN":"NOUN", - "NTTL":"NOUN", - "CNIT":"NOUN", - "CLTV":"NOUN", - "CMTR":"NOUN", - "CFQC":"NOUN", - "CVBL":"NOUN", + # NOUN + "NOUN": "NOUN", + "NCMN": "NOUN", + "NTTL": "NOUN", + "CNIT": "NOUN", + "CLTV": "NOUN", + "CMTR": "NOUN", + "CFQC": "NOUN", + "CVBL": "NOUN", # VERB - "VACT":"VERB", - "VSTA":"VERB", - #PRON - "PRON":"PRON", - "NPRP":"PRON", + "VACT": "VERB", + "VSTA": "VERB", + # PRON + "PRON": "PRON", + "NPRP": "PRON", # ADJ "ADJ": "ADJ", "NONM": "ADJ", @@ -40,13 +38,13 @@ "ADVI": "ADV", "ADVP": "ADV", "ADVS": "ADV", - # INT + # INT "INT": "INTJ", # PRON - "PROPN":"PROPN", - "PPRS":"PROPN", - "PDMN":"PROPN", - "PNTR":"PROPN", + "PROPN": "PROPN", + "PPRS": "PROPN", + "PDMN": "PROPN", + "PNTR": "PROPN", # DET "DET": "DET", "DDAN": "DET", @@ -56,57 +54,74 @@ "DIAC": "DET", "DIBQ": "DET", "DIAQ": "DET", - "DCNM": "DET", # NUM "NUM": "NUM", "NCNM": "NUM", "NLBL": "NUM", "DCNM": "NUM", - # AUX + # AUX "AUX": "AUX", "XVBM": "AUX", "XVAM": "AUX", "XVMM": "AUX", "XVBB": "AUX", "XVAE": "AUX", - # ADP + # ADP "ADP": "ADP", "RPRE": "ADP", # CCONJ - "CCONJ":"CCONJ", - "JCRG":"CCONJ", - # SCONJ - "SCONJ":"SCONJ", - "PREL":"SCONJ", - "JSBR":"SCONJ", - "JCMP":"SCONJ", + "CCONJ": "CCONJ", + "JCRG": "CCONJ", + # SCONJ + "SCONJ": "SCONJ", + "PREL": "SCONJ", + "JSBR": "SCONJ", + "JCMP": "SCONJ", # PART - "PART":"PART", - "FIXN":"PART", - "FIXV":"PART", - "EAFF":"PART", - "EITT":"PART", - "AITT":"PART", - "NEG":"PART", + "PART": "PART", + "FIXN": "PART", + 
"FIXV": "PART", + "EAFF": "PART", + "EITT": "PART", + "AITT": "PART", + "NEG": "PART", # PUNCT - "PUNCT":"PUNCT", - "PUNC":"PUNCT" + "PUNCT": "PUNCT", + "PUNC": "PUNCT", } -def _UD_Exception(w,tag): - if w=="āļāļēāļĢ" or w=="āļ„āļ§āļēāļĄ": - return "NOUN" - return tag -def _orchid_to_ud(tag): - _i=0 - temp=[] - while _i str: + if w == "āļāļēāļĢ" or w == "āļ„āļ§āļēāļĄ": + return "NOUN" + + return tag + + +def _orchid_to_ud(tag) -> List[Tuple[str, str]]: + _i = 0 + temp = [] + while _i < len(tag): + temp.append((tag[_i][0], _UD_Exception(tag[_i][0], _TAG_MAP_UD[tag[_i][1]]))) + _i += 1 + + return temp + + +def _artagger_tag(words: List[str], corpus: str = None) -> List[Tuple[str, str]]: + if not words: + return [] + + from artagger import Tagger -def pos_tag(words, engine="perceptron", corpus="orchid"): + words_ = Tagger().tag(" ".join(words)) + + return [(word.word, word.tag) for word in words_] + + +def pos_tag( + words: List[str], engine: str = "perceptron", corpus: str = "orchid" +) -> List[Tuple[str, str]]: """ Part of Speech tagging function. @@ -121,41 +136,36 @@ def pos_tag(words, engine="perceptron", corpus="orchid"): * pud - Parallel Universal Dependencies (PUD) treebanks :return: returns a list of labels regarding which part of speech it is """ - _corpus=corpus - _tag=[] - if corpus=="orchid_ud": - corpus="orchid" + _corpus = corpus + _tag = [] + if corpus == "orchid_ud": + corpus = "orchid" if not words: return [] if engine == "perceptron": from .perceptron import tag as tag_ elif engine == "artagger": - - def tag_(words, corpus=None): - if not words: - return [] - - from artagger import Tagger - words_ = Tagger().tag(" ".join(words)) - - return [(word.word, word.tag) for word in words_] - + tag_ = _artagger_tag else: # default, use "unigram" ("old") engine from .unigram import tag as tag_ - _tag= tag_(words, corpus=corpus) - if _corpus=="orchid_ud": - _tag=_orchid_to_ud(_tag) + _tag = tag_(words, corpus=corpus) + + if _corpus == "orchid_ud": + _tag = _orchid_to_ud(_tag) + return _tag -def pos_tag_sents(sentences, engine="perceptron", corpus="orchid"): +def pos_tag_sents( + sentences: List[List[str]], engine: str = "perceptron", corpus: str = "orchid" +) -> List[List[Tuple[str, str]]]: """ Part of Speech tagging Sentence function. 
- :param list sentences: a list of tokenized sentences (a list of tokenized words in sentences) + :param list sentences: a list of lists of tokenized words :param str engine: - * unigram - unigram tagger + * unigram - unigram tagger * perceptron - perceptron tagger (default) * artagger - RDR POS tagger :param str corpus: diff --git a/pythainlp/tag/locations.py b/pythainlp/tag/locations.py index 01bf3060c..74fb96e5d 100644 --- a/pythainlp/tag/locations.py +++ b/pythainlp/tag/locations.py @@ -3,10 +3,12 @@ Recognizes locations in text """ +from typing import List, Tuple + from pythainlp.corpus import provinces -def tag_provinces(tokens): +def tag_provinces(tokens: List[str]) -> List[Tuple[str, str]]: """ Recognize Thailand provinces in text diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index a1236d171..dca5d18b8 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -5,20 +5,22 @@ __all__ = ["ThaiNameTagger"] +from typing import List, Tuple, Union + import sklearn_crfsuite from pythainlp.corpus import download, get_corpus_path, thai_stopwords from pythainlp.tag import pos_tag from pythainlp.tokenize import word_tokenize -from pythainlp.util import is_thaiword +from pythainlp.util import isthai _WORD_TOKENIZER = "newmm" # āļ•āļąāļ§āļ•āļąāļ”āļ„āļģ -def _is_stopword(word): # āđ€āļŠāđ‡āļ„āļ§āđˆāļēāđ€āļ›āđ‡āļ™āļ„āļģāļŸāļļāđˆāļĄāđ€āļŸāļ·āļ­āļĒ +def _is_stopword(word: str) -> bool: # āđ€āļŠāđ‡āļ„āļ§āđˆāļēāđ€āļ›āđ‡āļ™āļ„āļģāļŸāļļāđˆāļĄāđ€āļŸāļ·āļ­āļĒ return word in thai_stopwords() -def _doc2features(doc, i): +def _doc2features(doc, i) -> dict: word = doc[i][0] postag = doc[i][1] @@ -26,7 +28,7 @@ def _doc2features(doc, i): features = { "word.word": word, "word.stopword": _is_stopword(word), - "word.isthai": is_thaiword(word), + "word.isthai": isthai(word), "word.isspace": word.isspace(), "postag": postag, "word.isdigit": word.isdigit(), @@ -41,7 +43,7 @@ def _doc2features(doc, i): prev_features = { "word.prevword": prevword, "word.previsspace": prevword.isspace(), - "word.previsthai": is_thaiword(prevword), + "word.previsthai": isthai(prevword), "word.prevstopword": _is_stopword(prevword), "word.prevpostag": prevpostag, "word.prevwordisdigit": prevword.isdigit(), @@ -58,7 +60,7 @@ def _doc2features(doc, i): "word.nextword": nextword, "word.nextisspace": nextword.isspace(), "word.nextpostag": nextpostag, - "word.nextisthai": is_thaiword(nextword), + "word.nextisthai": isthai(nextword), "word.nextstopword": _is_stopword(nextword), "word.nextwordisdigit": nextword.isdigit(), } @@ -87,7 +89,9 @@ def __init__(self): model_filename=self.__data_path, ) - def get_ner(self, text, pos=True): + def get_ner( + self, text: str, pos: bool = True + ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: """ Get named-entities in text @@ -101,10 +105,11 @@ def get_ner(self, text, pos=True): >>> ner = ThaiNameTagger() >>> ner.get_ner("āļ§āļąāļ™āļ—āļĩāđˆ 15 āļ.āļĒ. 
61 āļ—āļ”āļŠāļ­āļšāļĢāļ°āļšāļšāđ€āļ§āļĨāļē 14:49 āļ™.") [('āļ§āļąāļ™āļ—āļĩāđˆ', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), ('15', 'NUM', 'B-DATE'), - (' ', 'PUNCT', 'I-DATE'), ('āļ.āļĒ.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'), - ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'), ('āļ—āļ”āļŠāļ­āļš', 'VERB', 'O'), - ('āļĢāļ°āļšāļš', 'NOUN', 'O'), ('āđ€āļ§āļĨāļē', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), - ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), ('49', 'NUM', 'I-TIME'), + (' ', 'PUNCT', 'I-DATE'), ('āļ.āļĒ.', 'NOUN', 'I-DATE'), + (' ', 'PUNCT', 'I-DATE'), ('61', 'NUM', 'I-DATE'), + (' ', 'PUNCT', 'O'), ('āļ—āļ”āļŠāļ­āļš', 'VERB', 'O'), + ('āļĢāļ°āļšāļš', 'NOUN', 'O'), ('āđ€āļ§āļĨāļē', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), + ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'), ('āļ™.', 'NOUN', 'I-TIME')] >>> ner.get_ner("āļ§āļąāļ™āļ—āļĩāđˆ 15 āļ.āļĒ. 61 āļ—āļ”āļŠāļ­āļšāļĢāļ°āļšāļšāđ€āļ§āļĨāļē 14:49 āļ™.", pos=False) [('āļ§āļąāļ™āļ—āļĩāđˆ', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'), @@ -113,7 +118,9 @@ def get_ner(self, text, pos=True): (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('āļ™.', 'I-TIME')] """ self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER) - self.__pos_tags = pos_tag(self.__tokens,engine="perceptron", corpus="orchid_ud") + self.__pos_tags = pos_tag( + self.__tokens, engine="perceptron", corpus="orchid_ud" + ) self.__x_test = self.__extract_features(self.__pos_tags) self.__y = self.crf.predict_single(self.__x_test) diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py index 4032df759..ccff12427 100644 --- a/pythainlp/tag/perceptron.py +++ b/pythainlp/tag/perceptron.py @@ -3,6 +3,7 @@ Perceptron Part-Of-Speech tagger """ import os +from typing import List, Tuple import dill from pythainlp.corpus import corpus_path @@ -22,127 +23,124 @@ def _load_tagger(filename): _PUD_TAGGER = _load_tagger(_PUD_DATA_FILENAME) -def tag(words, corpus="pud"): +def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]: """ āļĢāļąāļšāļ„āđˆāļēāđ€āļ›āđ‡āļ™ ''list'' āļ„āļ·āļ™āļ„āđˆāļēāđ€āļ›āđ‡āļ™ ''list'' āđ€āļŠāđˆāļ™ [('āļ„āļģ', 'āļŠāļ™āļīāļ”āļ„āļģ'), ('āļ„āļģ', 'āļŠāļ™āļīāļ”āļ„āļģ'), ...] """ if not words: return [] - # perceptron tagger cannot handle empty string - #words = [word.strip() for word in words if word.strip()] - if corpus == "orchid": tagger = _ORCHID_TAGGER - i=0 - while i': - words[i]="" - elif words[i]=='=': - words[i]="" - elif words[i]=='!': - words[i]="" - elif words[i]=='’': - words[i]="" - elif words[i]==':': - words[i]="" - elif words[i]=='*': - words[i]="" - elif words[i]==';': - words[i]="" - elif words[i]=='/': - words[i]="" - i+=1 - t2=tagger.tag(words) - t=[] - i=0 - while i" or word=='': - word=" " - elif word=="": - word="+" - elif word=="": - word="-" - elif word=="": - word="=" - elif word=="": - word="," - elif word=="": - word="$" - elif word=="": - word="." - elif word=="": - word="(" - elif word=="": - word=")" - elif word=="": - word='"' - elif word=="": - word='@' - elif word=="": - word='&' - elif word=="": - word='{' - elif word=="": - word='^' - elif word=="": - word='?' - elif word=="": - word='<' - elif word=="": - word='>' - elif word=="": - word='=' - elif word=="": - word='!' 
- elif word=="": - word='’' - elif word=="": - word=':' - elif word=="": - word='*' - elif word=="": - word=';' - elif word=="": - word='/' - t.append((word,tag)) - i+=1 - #t=temp + i = 0 + while i < len(words): + if words[i] == " ": + words[i] = "" + elif words[i] == "+": + words[i] = "" + elif words[i] == "-": + words[i] = "" + elif words[i] == "=": + words[i] = "" + elif words[i] == ",": + words[i] = "" + elif words[i] == "$": + words[i] = "" + elif words[i] == ".": + words[i] = "" + elif words[i] == "(": + words[i] = "" + elif words[i] == ")": + words[i] = "" + elif words[i] == '"': + words[i] = "" + elif words[i] == "@": + words[i] = "" + elif words[i] == "&": + words[i] = "" + elif words[i] == "{": + words[i] = "" + elif words[i] == "^": + words[i] = "" + elif words[i] == "?": + words[i] = "" + elif words[i] == "<": + words[i] = "" + elif words[i] == ">": + words[i] = "" + elif words[i] == "=": + words[i] = "" + elif words[i] == "!": + words[i] = "" + elif words[i] == "’": + words[i] = "" + elif words[i] == ":": + words[i] = "" + elif words[i] == "*": + words[i] = "" + elif words[i] == ";": + words[i] = "" + elif words[i] == "/": + words[i] = "" + i += 1 + t2 = tagger.tag(words) + t = [] + i = 0 + while i < len(t2): + word = t2[i][0] + tag = t2[i][1] + if word == "": + word = " " + elif word == "": + word = "+" + elif word == "": + word = "-" + elif word == "": + word = "=" + elif word == "": + word = "," + elif word == "": + word = "$" + elif word == "": + word = "." + elif word == "": + word = "(" + elif word == "": + word = ")" + elif word == "": + word = '"' + elif word == "": + word = "@" + elif word == "": + word = "&" + elif word == "": + word = "{" + elif word == "": + word = "^" + elif word == "": + word = "?" + elif word == "": + word = "<" + elif word == "": + word = ">" + elif word == "": + word = "=" + elif word == "": + word = "!" + elif word == "": + word = "’" + elif word == "": + word = ":" + elif word == "": + word = "*" + elif word == "": + word = ";" + elif word == "": + word = "/" + t.append((word, tag)) + i += 1 else: # default, use "pud" as a corpus tagger = _PUD_TAGGER - t=tagger.tag(words) + t = tagger.tag(words) + return t diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py index 863323a1f..ece6e3028 100644 --- a/pythainlp/tag/unigram.py +++ b/pythainlp/tag/unigram.py @@ -4,6 +4,7 @@ """ import json import os +from typing import List, Tuple import dill import nltk.tag @@ -27,7 +28,7 @@ def _pud_tagger(): return model -def tag(words, corpus): +def tag(words: List[str], corpus: str) -> List[Tuple[str, str]]: """ āļĢāļąāļšāļ„āđˆāļēāđ€āļ›āđ‡āļ™ ''list'' āļ„āļ·āļ™āļ„āđˆāļēāđ€āļ›āđ‡āļ™ ''list'' āđ€āļŠāđˆāļ™ [('āļ„āļģ', 'āļŠāļ™āļīāļ”āļ„āļģ'), ('āļ„āļģ', 'āļŠāļ™āļīāļ”āļ„āļģ'), ...] """ @@ -36,116 +37,116 @@ def tag(words, corpus): if corpus == "orchid": tagger = nltk.tag.UnigramTagger(model=_orchid_tagger()) - i=0 - while i': - words[i]="" - elif words[i]=='=': - words[i]="" - elif words[i]=='!': - words[i]="" - elif words[i]=='’': - words[i]="" - elif words[i]==':': - words[i]="" - elif words[i]=='*': - words[i]="" - elif words[i]==';': - words[i]="" - elif words[i]=='/': - words[i]="" - i+=1 - t=tagger.tag(words) - temp=[] - i=0 - while i": - word=" " - elif word=="": - word="+" - elif word=="": - word="-" - elif word=="": - word="=" - elif word=="": - word="," - elif word=="": - word="$" - elif word=="": - word="." 
- elif word=="": - word="(" - elif word=="": - word=")" - elif word=="": - word='"' - elif word=="": - word='@' - elif word=="": - word='&' - elif word=="": - word='{' - elif word=="": - word='^' - elif word=="": - word='?' - elif word=="": - word='<' - elif word=="": - word='>' - elif word=="": - word='=' - elif word=="": - word='!' - elif word=="": - word='’' - elif word=="": - word=':' - elif word=="": - word='*' - elif word=="": - word=';' - elif word=="": - word='/' - temp.append((word,tag)) - i+=1 - t=temp + i = 0 + while i < len(words): + if words[i] == " ": + words[i] = "" + elif words[i] == "+": + words[i] = "" + elif words[i] == "-": + words[i] = "" + elif words[i] == "=": + words[i] = "" + elif words[i] == ",": + words[i] = "" + elif words[i] == "$": + words[i] = "" + elif words[i] == ".": + words[i] = "" + elif words[i] == "(": + words[i] = "" + elif words[i] == ")": + words[i] = "" + elif words[i] == '"': + words[i] = "" + elif words[i] == "@": + words[i] = "" + elif words[i] == "&": + words[i] = "" + elif words[i] == "{": + words[i] = "" + elif words[i] == "^": + words[i] = "" + elif words[i] == "?": + words[i] = "" + elif words[i] == "<": + words[i] = "" + elif words[i] == ">": + words[i] = "" + elif words[i] == "=": + words[i] = "" + elif words[i] == "!": + words[i] = "" + elif words[i] == "’": + words[i] = "" + elif words[i] == ":": + words[i] = "" + elif words[i] == "*": + words[i] = "" + elif words[i] == ";": + words[i] = "" + elif words[i] == "/": + words[i] = "" + i += 1 + t = tagger.tag(words) + temp = [] + i = 0 + while i < len(t): + word = t[i][0] + tag = t[i][1] + if word == "": + word = " " + elif word == "": + word = "+" + elif word == "": + word = "-" + elif word == "": + word = "=" + elif word == "": + word = "," + elif word == "": + word = "$" + elif word == "": + word = "." + elif word == "": + word = "(" + elif word == "": + word = ")" + elif word == "": + word = '"' + elif word == "": + word = "@" + elif word == "": + word = "&" + elif word == "": + word = "{" + elif word == "": + word = "^" + elif word == "": + word = "?" + elif word == "": + word = "<" + elif word == "": + word = ">" + elif word == "": + word = "=" + elif word == "": + word = "!" 
+ elif word == "": + word = "’" + elif word == "": + word = ":" + elif word == "": + word = "*" + elif word == "": + word = ";" + elif word == "": + word = "/" + temp.append((word, tag)) + i += 1 + t = temp else: tagger = _pud_tagger() - t=tagger.tag(words) + t = tagger.tag(words) return t diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index d3c9bb1d5..b87cf13e5 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -3,6 +3,8 @@ Thai tokenizers """ import re +from typing import Iterable, List, Union + from pythainlp.corpus import get_corpus, thai_syllables, thai_words from marisa_trie import Trie @@ -11,11 +13,13 @@ FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt")) -def word_tokenize(text, engine="newmm", whitespaces=True): +def word_tokenize( + text: str, engine: str = "newmm", whitespaces: bool = True +) -> List[str]: """ :param str text: text to be tokenized :param str engine: tokenizer to be used - :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai + :param bool whitespaces: True to output no whitespace, a common mark of end of phrase in Thai :Parameters for engine: * newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster * longest - dictionary-based, Longest Matching @@ -60,7 +64,9 @@ def segment(text): return segment(text) -def dict_word_tokenize(text, custom_dict, engine="newmm"): +def dict_word_tokenize( + text: str, custom_dict: Trie, engine: str = "newmm" +) -> List[str]: """ :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure. :param str text: text to be tokenized @@ -90,7 +96,7 @@ def dict_word_tokenize(text, custom_dict, engine="newmm"): return segment(text, custom_dict) -def sent_tokenize(text, engine="whitespace+newline"): +def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]: """ This function does not yet automatically recognize when a sentence actually ends. Rather it helps split text where white space and a new line is found. @@ -106,28 +112,36 @@ def sent_tokenize(text, engine="whitespace+newline"): sentences = [] if engine == "whitespace": - sentences = re.split(r' +', text, re.U) + sentences = re.split(r" +", text, re.U) else: # default, use whitespace + newline sentences = text.split() return sentences -def subword_tokenize(text, engine="tcc"): +def subword_tokenize(text: str, engine: str = "tcc") -> List[str]: """ :param str text: text to be tokenized - :param str engine: choosing 'tcc' uses the Thai Character Cluster rule to segment words into the smallest unique units. + :param str engine: subword tokenizer + :Parameters for engine: + * tcc (default) - Thai Character Cluster (Theeramunkong et al. 2000) + * etcc - Enhanced Thai Character Cluster (Inrut et al. 2001) [In development] :return: a list of tokenized strings. """ if not text: return "" from .tcc import tcc + from .etcc import etcc + if engine == "tcc": + return tcc(text) + elif engine == "etcc": + return etcc(text).split("/") + #default return tcc(text) - -def syllable_tokenize(text): +def syllable_tokenize(text: str) -> List[str]: """ :param str text: input string to be tokenized @@ -147,7 +161,7 @@ def syllable_tokenize(text): return tokens -def dict_trie(dict_source): +def dict_trie(dict_source: Union[str, Iterable]) -> Trie: """ Create a dict trie which will be used for word_tokenize() function. 
For more information on the trie data structure, @@ -162,17 +176,19 @@ def dict_trie(dict_source): with open(dict_source, "r", encoding="utf8") as f: _vocabs = f.read().splitlines() return Trie(_vocabs) - elif isinstance(dict_source, (list, tuple, set, frozenset)): + elif isinstance(dict_source, Iterable): # Received a sequence type object of vocabs return Trie(dict_source) else: raise TypeError( - "Type of dict_source must be either str (path to source file) or collections" + "Type of dict_source must be either str (path to source file) or iterable" ) class Tokenizer: - def __init__(self, custom_dict=None,tokenize_engine="newmm"): + def __init__( + self, custom_dict: Union[str, Iterable] = None, tokenize_engine: str = "newmm" + ): """ Initialize tokenizer object @@ -180,20 +196,24 @@ def __init__(self, custom_dict=None,tokenize_engine="newmm"): :param str tokenize_engine: choose between different options of engine to token (newmm, mm, longest) """ self.__trie_dict = None - self.word_engine=tokenize_engine + self.word_engine = tokenize_engine if custom_dict: self.__trie_dict = dict_trie(custom_dict) else: self.__trie_dict = dict_trie(thai_words()) - def word_tokenize(self, text): + + def word_tokenize(self, text: str) -> List[str]: """ :param str text: text to be tokenized :return: list of words, tokenized from the text """ - return dict_word_tokenize(text,custom_dict=self.__trie_dict,engine=self.word_engine) - def set_tokenize_engine(self,name_engine): + return dict_word_tokenize( + text, custom_dict=self.__trie_dict, engine=self.word_engine + ) + + def set_tokenize_engine(self, name_engine: str) -> None: """ :param str name_engine: choose between different options of engine to token (newmm, mm, longest) """ - self.word_engine=name_engine \ No newline at end of file + self.word_engine = name_engine diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 395e76583..a3844c2f3 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -3,8 +3,10 @@ Wrapper for deepcut Thai word segmentation """ +from typing import List + import deepcut -def segment(text): +def segment(text: str) -> List[str]: return deepcut.tokenize(text) diff --git a/pythainlp/tokenize/etcc.py b/pythainlp/tokenize/etcc.py index dbe04122a..1df6eaaec 100644 --- a/pythainlp/tokenize/etcc.py +++ b/pythainlp/tokenize/etcc.py @@ -3,6 +3,8 @@ āđ‚āļ›āļĢāđāļāļĢāļĄ ETCC āđƒāļ™ Python āļžāļąāļ’āļ™āļēāđ‚āļ”āļĒ āļ™āļēāļĒ āļ§āļĢāļĢāļ“āļžāļ‡āļĐāđŒ āļ āļąāļ—āļ—āļīāļĒāđ„āļžāļšāļđāļĨāļĒāđŒ 19 āļĄāļī.āļĒ. 2560 +Reference: Inrut, Jeeragone, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth. "Thai word segmentation using combination of forward and backward longest matching techniques." In International Symposium on Communications and Information Technology (ISCIT), pp. 37-40. 2001. 
+ āļ§āļīāļ˜āļĩāđƒāļŠāđ‰āļ‡āļēāļ™ etcc(āļ„āļģ) @@ -20,7 +22,7 @@ _UV2 = "[" + "".join(["āļą", "āļ·"]) + "]" -def etcc(text): +def etcc(text: str) -> str: """ Enhanced Thai Character Cluster (ETCC) diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py index 33ff1fa0a..83ce495a1 100644 --- a/pythainlp/tokenize/longest.py +++ b/pythainlp/tokenize/longest.py @@ -39,14 +39,13 @@ class LongestMatchTokenizer(object): def __init__(self, trie): self.__trie = trie - def __search_nonthai(self, text): + def __search_nonthai(self, text: str): match = _RE_NONTHAI.search(text) if match.group(0): return match.group(0).lower() - else: - return None + return None - def __is_next_word_valid(self, text, begin_pos): + def __is_next_word_valid(self, text: str, begin_pos: int) -> bool: len_text = len(text) text = text[begin_pos:len_text].strip() @@ -63,7 +62,7 @@ def __is_next_word_valid(self, text, begin_pos): return False - def __longest_matching(self, text, begin_pos): + def __longest_matching(self, text: str, begin_pos: int): len_text = len(text) text = text[begin_pos:len_text] @@ -94,7 +93,7 @@ def __longest_matching(self, text, begin_pos): else: return "" - def __segment_text(self, text): + def __segment_text(self, text: str): if not text: return [] diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 17815fd9f..066ff1017 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -9,6 +9,7 @@ import re from collections import defaultdict from heapq import heappop, heappush # for priority queue +from typing import List from pythainlp.tokenize import DEFAULT_DICT_TRIE @@ -38,7 +39,7 @@ def bfs_paths_graph(graph, start, goal): queue.append((next, path + [next])) -def onecut(text, trie): +def onecut(text: str, trie): graph = defaultdict(list) # main data structure allow_pos = tcc_pos(text) # āļ•āļģāđāļŦāļ™āđˆāļ‡āļ—āļĩāđˆāļ•āļąāļ” āļ•āđ‰āļ­āļ‡āļ•āļĢāļ‡āļāļąāļš tcc @@ -90,7 +91,7 @@ def onecut(text, trie): # āļŠāđˆāļ§āļĒāđƒāļŦāđ‰āđ„āļĄāđˆāļ•āđ‰āļ­āļ‡āļžāļīāļĄāļžāđŒāļĒāļēāļ§āđ† -def segment(text, trie=None): +def segment(text: str, trie=None) -> List[str]: if not text: return [] diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py index 23b7b38e4..33fc0aabc 100644 --- a/pythainlp/tokenize/pyicu.py +++ b/pythainlp/tokenize/pyicu.py @@ -3,11 +3,12 @@ Wrapper for ICU word segmentation """ import re +from typing import List from icu import BreakIterator, Locale -def _gen_words(text): +def _gen_words(text: str) -> str: bd = BreakIterator.createWordInstance(Locale("th")) bd.setText(text) p = bd.first() @@ -16,7 +17,7 @@ def _gen_words(text): p = q -def segment(text): +def segment(text: str) -> List[str]: if not text: return [] diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py index b50bdb24a..ee945e929 100644 --- a/pythainlp/tokenize/tcc.py +++ b/pythainlp/tokenize/tcc.py @@ -1,8 +1,9 @@ ïŧŋ# -*- coding: utf-8 -*- """ Separate Thai text into Thai Character Cluster (TCC). -Based on "Character cluster based Thai information retrieval" (Theeramunkong et al. 2002) -http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548 +Based on "Character cluster based Thai information retrieval" (Theeramunkong et al. 
2000) +https://dl.acm.org/citation.cfm?id=355225 +http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548 Credits: - TCC: Jakkrit TeCho @@ -10,6 +11,7 @@ - Python code: Korakot Chaovavanich """ import re +from typing import List, Set RE_TCC = ( """\ @@ -47,9 +49,9 @@ PAT_TCC = re.compile("|".join(RE_TCC)) -def tcc_gen(w): +def tcc_gen(w: str) -> str: if not w: - return '' + return "" p = 0 while p < len(w): @@ -62,7 +64,7 @@ def tcc_gen(w): p += n -def tcc_pos(text): +def tcc_pos(text: str) -> Set[int]: if not text: return set() @@ -75,8 +77,5 @@ def tcc_pos(text): return p_set -def tcc(text, sep="/"): - if not text: - return "" - - return sep.join(tcc_gen(text)) +def tcc(text: str) -> List[str]: + return list(tcc_gen(text)) diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py index e2487e582..5f7a5a5cb 100644 --- a/pythainlp/tools/__init__.py +++ b/pythainlp/tools/__init__.py @@ -5,19 +5,20 @@ For text processing and text conversion, see pythainlp.util """ import os -import sys + import pythainlp PYTHAINLP_DATA_DIR = "pythainlp-data" -def get_full_data_path(path): + +def get_full_data_path(path: str) -> str: """ Get filename/path of a dataset, return full path of that filename/path """ return os.path.join(get_pythainlp_data_path(), path) -def get_pythainlp_data_path(): +def get_pythainlp_data_path() -> str: """ Return full path where PyThaiNLP keeps its (downloaded) data """ @@ -27,7 +28,7 @@ def get_pythainlp_data_path(): return path -def get_pythainlp_path(): +def get_pythainlp_path() -> str: """ Return full path of PyThaiNLP code """ diff --git a/pythainlp/transliterate/__init__.py b/pythainlp/transliterate/__init__.py index df96b0360..91435cc54 100644 --- a/pythainlp/transliterate/__init__.py +++ b/pythainlp/transliterate/__init__.py @@ -3,15 +3,15 @@ from pythainlp.tokenize import word_tokenize -# āļ–āļ­āļ”āđ€āļŠāļĩāļĒāļ‡āļ āļēāļĐāļēāđ„āļ—āļĒāđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāļĨāļ°āļ•āļīāļ™ -def romanize(text, engine="royin"): +def romanize(text: str, engine: str = "royin") -> str: """ + āļ–āļ­āļ”āđ€āļŠāļĩāļĒāļ‡āļ āļēāļĐāļēāđ„āļ—āļĒāđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāļĨāļ°āļ•āļīāļ™ :param str text: Thai text to be romanized :param str engine: 'royin' (default) or 'thai2rom'. 'royin' uses Thai Royal Institute standard. 'thai2rom' is deep learning Thai romanization (require keras). :return: English (more or less) text that spells out how the Thai text should read. """ - if isinstance(text,str)==False: + if not isinstance(text, str) or not text: return "" if engine == "thai2rom": @@ -21,22 +21,20 @@ def romanize(text, engine="royin"): else: # use default engine "royin" from .royin import romanize - try: - words = word_tokenize(text) - romanized_words = [romanize(word) for word in words] - except: - romanized_words =[romanize(text)] + words = word_tokenize(text) + romanized_words = [romanize(word) for word in words] + return "".join(romanized_words) -def transliterate(text, engine="ipa"): +def transliterate(text: str, engine: str = "ipa") -> str: """ :param str text: Thai text to be transliterated :param str engine: 'ipa' (default) or 'pyicu'. :return: A string of Internaitonal Phonetic Alphabets indicating how the text should read. 
""" - if not text: + if not isinstance(text, str) or not text: return "" if engine == "pyicu": diff --git a/pythainlp/transliterate/ipa.py b/pythainlp/transliterate/ipa.py index 5fe18d24d..be7c1e1c6 100644 --- a/pythainlp/transliterate/ipa.py +++ b/pythainlp/transliterate/ipa.py @@ -7,7 +7,7 @@ _EPI_THA = epitran.Epitran("tha-Thai") -def transliterate(text): +def transliterate(text: str) -> str: return _EPI_THA.transliterate(text) diff --git a/pythainlp/transliterate/pyicu.py b/pythainlp/transliterate/pyicu.py index e34be0e16..5e4a755aa 100644 --- a/pythainlp/transliterate/pyicu.py +++ b/pythainlp/transliterate/pyicu.py @@ -6,7 +6,7 @@ # āļ–āļ­āļ”āđ€āļŠāļĩāļĒāļ‡āļ āļēāļĐāļēāđ„āļ—āļĒāđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāļĨāļ°āļ•āļīāļ™ -def transliterate(text): +def transliterate(text: str) -> str: """ āļ–āļ­āļ”āđ€āļŠāļĩāļĒāļ‡āļ āļēāļĐāļēāđ„āļ—āļĒāđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāļĨāļ°āļ•āļīāļ™ āļĢāļąāļšāļ„āđˆāļē ''str'' āļ‚āđ‰āļ­āļ„āļ§āļēāļĄ āļ„āļ·āļ™āļ„āđˆāļē ''str'' āļ­āļąāļāļĐāļĢāļĨāļ°āļ•āļīāļ™ """ diff --git a/pythainlp/transliterate/royin.py b/pythainlp/transliterate/royin.py index 62e44783b..d6f6f71c8 100644 --- a/pythainlp/transliterate/royin.py +++ b/pythainlp/transliterate/royin.py @@ -117,20 +117,20 @@ ) -def _normalize(text): +def _normalize(text: str) -> str: """āļ•āļąāļ”āļ­āļąāļāļĐāļĢāļ—āļĩāđˆāđ„āļĄāđˆāļ­āļ­āļāđ€āļŠāļĩāļĒāļ‡ (āļāļēāļĢāļąāļ™āļ•āđŒ āđ„āļ›āļĒāļēāļĨāļ™āđ‰āļ­āļĒ āđ„āļĄāđ‰āļĒāļĄāļ*) āđāļĨāļ°āļ§āļĢāļĢāļ“āļĒāļļāļāļ•āđŒāļ—āļīāđ‰āļ‡""" return _RE_NORMALIZE.sub("", text) -def _replace_vowels(word): +def _replace_vowels(word: str) -> str: for vowel in _VOWELS: word = re.sub(vowel[0], vowel[1], word) return word -def _replace_consonants(word, res): - if res is None: +def _replace_consonants(word: str, res: str) -> str: + if not res: pass elif len(res) == 1: word = word.replace(res[0], _CONSONANTS[res[0]][0]) @@ -162,9 +162,10 @@ def _replace_consonants(word, res): return word -def romanize(word): - if isinstance(word,str)==False: +def romanize(word: str) -> str: + if not isinstance(word, str) or not word: return "" + word2 = _replace_vowels(_normalize(word)) res = _RE_CONSONANT.findall(word2) @@ -175,5 +176,5 @@ def romanize(word): word2 = "".join(word2) word2 = _replace_consonants(word2, res) - + return word2 \ No newline at end of file diff --git a/pythainlp/transliterate/thai2rom.py b/pythainlp/transliterate/thai2rom.py index 49a498d83..1dc5a5267 100644 --- a/pythainlp/transliterate/thai2rom.py +++ b/pythainlp/transliterate/thai2rom.py @@ -157,5 +157,5 @@ def romanize(self, text): _THAI_TO_ROM = ThaiTransliterator() -def romanize(text): +def romanize(text: str) -> str: return _THAI_TO_ROM.romanize(text) diff --git a/pythainlp/ulmfit/__init__.py b/pythainlp/ulmfit/__init__.py index ab56c81ce..00c9f8891 100644 --- a/pythainlp/ulmfit/__init__.py +++ b/pythainlp/ulmfit/__init__.py @@ -35,8 +35,9 @@ _MODEL_NAME_LSTM = "wiki_lm_lstm" _ITOS_NAME_LSTM = "wiki_itos_lstm" + # Download pretrained models -def _get_path(fname): +def _get_path(fname: str) -> str: """ :meth: download get path of file from pythainlp-corpus :param str fname: file name @@ -56,7 +57,7 @@ class ThaiTokenizer(BaseTokenizer): https://docs.fast.ai/text.transform#BaseTokenizer """ - def __init__(self, lang = "th"): + def __init__(self, lang="th"): self.lang = lang def tokenizer(self, t): @@ -94,6 +95,7 @@ def rm_brackets(t): new_line = re.sub(r"\[\]", "", new_line) return new_line + def ungroup_emoji(toks): "Ungroup emojis" res = [] @@ -105,6 +107,7 @@ def ungroup_emoji(toks): res.append(tok) return res + def 
lowercase_all(toks): "lowercase all English words" return [tok.lower() for tok in toks] @@ -112,17 +115,26 @@ def lowercase_all(toks): # Pretrained paths # TODO: Let the user decide if they like to download (at setup?) -_THWIKI_LSTM = dict(wgts_fname=_get_path(_MODEL_NAME_LSTM), itos_fname=_get_path(_ITOS_NAME_LSTM)) +_THWIKI_LSTM = dict( + wgts_fname=_get_path(_MODEL_NAME_LSTM), itos_fname=_get_path(_ITOS_NAME_LSTM) +) # Preprocessing rules for Thai text -pre_rules_th = [fix_html, replace_rep_after, normalize_char_order, - spec_add_spaces, rm_useless_spaces, rm_useless_newlines, rm_brackets] +pre_rules_th = [ + fix_html, + replace_rep_after, + normalize_char_order, + spec_add_spaces, + rm_useless_spaces, + rm_useless_newlines, + rm_brackets, +] post_rules_th = [replace_all_caps, ungroup_emoji, lowercase_all] _tokenizer = ThaiTokenizer() -def document_vector(text, learn, data, agg='mean'): +def document_vector(text, learn, data, agg="mean"): """ :meth: `document_vector` get document vector using fastai language model and data bunch :param str text: text to extract embeddings @@ -131,18 +143,18 @@ def document_vector(text, learn, data, agg='mean'): :param agg: how to aggregate embeddings :return: `numpy.array` of document vector sized 400 based on the encoder of the model """ - + s = _tokenizer.tokenizer(text) t = torch.tensor(data.vocab.numericalize(s), requires_grad=False).to(device) m = learn.model[0].encoder.to(device) res = m(t).cpu().detach().numpy() - if agg == 'mean': + if agg == "mean": res = res.mean(0) - elif agg == 'sum': + elif agg == "sum": res = res.sum(0) else: - raise ValueError('Aggregate by mean or sum') - return(res) + raise ValueError("Aggregate by mean or sum") + return res def merge_wgts(em_sz, wgts, itos_pre, itos_new): diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index b7e194436..6a4ff0ce6 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -11,9 +11,9 @@ "digit_to_text", "eng_to_thai", "find_keyword", - "is_thai", - "is_thaichar", - "is_thaiword", + "countthai", + "isthai", + "isthaichar", "normalize", "now_reign_year", "num_to_thaiword", @@ -42,6 +42,6 @@ from .keywords import find_keyword, rank from .normalize import deletetone, normalize from .numtoword import bahttext, num_to_thaiword -from .thai import is_thai, is_thaichar, is_thaiword +from .thai import countthai, isthai, isthaichar +from .thaiwordcheck import thaicheck from .wordtonum import thaiword_to_num -from .thaiwordcheck import thaicheck \ No newline at end of file diff --git a/pythainlp/util/collate.py b/pythainlp/util/collate.py index bc35c2fe9..ffaff4998 100644 --- a/pythainlp/util/collate.py +++ b/pythainlp/util/collate.py @@ -4,25 +4,27 @@ Simple implementation using regular expressions """ import re +from typing import Iterable, List _RE_TONE = re.compile(r"[āđ‡-āđŒ]") _RE_LV_C = re.compile(r"([āđ€-āđ„])([āļ-āļŪ])") -def _thkey(word): +def _thkey(word: str) -> str: cv = _RE_TONE.sub("", word) # remove tone cv = _RE_LV_C.sub("\\2\\1", cv) # switch lead vowel tone = _RE_TONE.sub(" ", word) # just tone return cv + tone -def collate(data): +def collate(data: Iterable, reverse: bool = False) -> List[str]: """ - :param list data: a list of strings + :param list data: a list of strings to be sorted + :param bool reverse: reverse flag, set to get the result in descending order :return: a list of strings, sorted alphabetically, according to Thai rules **Example**:: >>> from pythainlp.util import * >>> collate(['āđ„āļāđˆ', 'āđ€āļ›āđ‡āļ”', 'āļŦāļĄāļđ', 
'āļ§āļąāļ§']) ['āđ„āļāđˆ', 'āđ€āļ›āđ‡āļ”', 'āļ§āļąāļ§', 'āļŦāļĄāļđ'] """ - return sorted(data, key=_thkey) + return sorted(data, key=_thkey, reverse=reverse) diff --git a/pythainlp/util/date.py b/pythainlp/util/date.py index f2d2ee15b..903e42fd4 100644 --- a/pythainlp/util/date.py +++ b/pythainlp/util/date.py @@ -63,7 +63,7 @@ # Conversion support for thai_strftime() -def _thai_strftime(datetime, fmt_c): +def _thai_strftime(datetime, fmt_c: str) -> str: text = "" if fmt_c == "a": # abbreviated weekday text = thai_abbr_weekdays[datetime.weekday()] @@ -73,7 +73,7 @@ def _thai_strftime(datetime, fmt_c): text = thai_abbr_months[datetime.month - 1] elif fmt_c == "B": # full month text = thai_full_months[datetime.month - 1] - elif fmt_c == "y": # # year without century + elif fmt_c == "y": # year without century text = str(datetime.year + 543)[2:4] elif fmt_c == "Y": # year with century text = str(datetime.year + 543) @@ -97,7 +97,7 @@ def _thai_strftime(datetime, fmt_c): return text -def thai_strftime(datetime, fmt, thaidigit=False): +def thai_strftime(datetime, fmt: str, thaidigit=False) -> str: """ Thai date and time string formatter @@ -126,7 +126,7 @@ def thai_strftime(datetime, fmt, thaidigit=False): If supported, we can just locale.setlocale(locale.LC_TIME, "th_TH") and then use native datetime.strftime(). - :return: Date and time spelled out in text, with month in Thai name and year in Thai Buddhist Era (BE). + :return: Date and time spelled out, with day and month names in Thai and year in Thai Buddhist Era (BE). """ thaidate_parts = [] diff --git a/pythainlp/util/digitconv.py b/pythainlp/util/digitconv.py index 16e634833..3982168d6 100644 --- a/pythainlp/util/digitconv.py +++ b/pythainlp/util/digitconv.py @@ -56,7 +56,7 @@ } -def thai_digit_to_arabic_digit(text): +def thai_digit_to_arabic_digit(text: str) -> str: """ :param str text: Text with Thai digits such as 'āđ‘', 'āđ’', 'āđ“' :return: Text with Thai digits being converted to Arabic digits such as '1', '2', '3' @@ -74,7 +74,7 @@ def thai_digit_to_arabic_digit(text): return "".join(newtext) -def arabic_digit_to_thai_digit(text): +def arabic_digit_to_thai_digit(text: str) -> str: """ :param str text: Text with Arabic digits such as '1', '2', '3' :return: Text with Arabic digits being converted to Thai digits such as 'āđ‘', 'āđ’', 'āđ“' @@ -92,7 +92,7 @@ def arabic_digit_to_thai_digit(text): return "".join(newtext) -def digit_to_text(text): +def digit_to_text(text: str) -> str: """ :param str text: Text with digits such as '1', '2', 'āđ“', 'āđ”' :return: Text with digits being spelled out in Thai @@ -113,7 +113,7 @@ def digit_to_text(text): return "".join(newtext) -def text_to_arabic_digit(text): +def text_to_arabic_digit(text: str) -> str: """ :param text: A digit spelled out in Thai :return: An Arabic digit such as '1', '2', '3' @@ -124,7 +124,7 @@ def text_to_arabic_digit(text): return _spell_digit[text] -def text_to_thai_digit(text): +def text_to_thai_digit(text: str) -> str: """ :param text: A digit spelled out in Thai :return: A Thai digit such as 'āđ‘', 'āđ’', 'āđ“' diff --git a/pythainlp/util/keyboard.py b/pythainlp/util/keyboard.py index 8fb4abc6e..ad156715d 100644 --- a/pythainlp/util/keyboard.py +++ b/pythainlp/util/keyboard.py @@ -101,7 +101,7 @@ TH_EN_KEYB_PAIRS = {v: k for k, v in EN_TH_KEYB_PAIRS.items()} -def eng_to_thai(text): +def eng_to_thai(text: str) -> str: """ Correct text in one language that is incorrectly-typed with a keyboard layout in another language. 
(type Thai with English keyboard) @@ -113,7 +113,7 @@ def eng_to_thai(text): ) -def thai_to_eng(text): +def thai_to_eng(text: str) -> str: """ Correct text in one language that is incorrectly-typed with a keyboard layout in another language. (type Thai with English keyboard) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 66c179fb9..3e05a2c69 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -44,7 +44,7 @@ ] # āđ€āļāđ‡āļšāļžāļ§āļ āļžāļīāļĄāļžāđŒāļĨāļģāļ”āļąāļšāļœāļīāļ”āļŦāļĢāļ·āļ­āļœāļīāļ”āđāļ›āđ‰āļ™āđāļ•āđˆāļāļĨāļąāļšāđāļŠāļ”āļ‡āļœāļĨāļ–āļđāļāļ•āđ‰āļ­āļ‡ āđƒāļŦāđ‰āđ„āļ›āđ€āļ›āđ‡āļ™āđāļ›āđ‰āļ™āļ—āļĩāđˆāļ–āļđāļāļ•āđ‰āļ­āļ‡ āđ€āļŠāđˆāļ™ āđ€ + āđ€ āđ„āļ›āđ€āļ›āđ‡āļ™ āđ -def normalize(text): +def normalize(text: str) -> str: """ Thai text normalize @@ -61,7 +61,7 @@ def normalize(text): return text -def deletetone(text): +def deletetone(text: str) -> str: """ Remove tonemarks diff --git a/pythainlp/util/numtoword.py b/pythainlp/util/numtoword.py index 394984d70..68519cb79 100644 --- a/pythainlp/util/numtoword.py +++ b/pythainlp/util/numtoword.py @@ -10,7 +10,7 @@ __all__ = ["bahttext", "num_to_thaiword"] -def bahttext(number): +def bahttext(number: float) -> str: """ Converts a number to Thai text and adds a suffix of "Baht" currency. Precision will be fixed at two decimal places (0.00) to fits "Satang" unit. @@ -41,9 +41,9 @@ def bahttext(number): return ret -def num_to_thaiword(number): +def num_to_thaiword(number: int) -> str: """ - :param float number: a float number (with decimals) indicating a quantity + :param int number: a float number (with decimals) indicating a quantity :return: a text that indicates the full amount in word form, properly ending each digit with the right term. """ ret = "" diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index f6b8f3d58..70e5a9d15 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -2,11 +2,15 @@ """ Check if it is Thai text """ +import string +_DEFAULT_IGNORE_CHARS = string.whitespace + string.digits + string.punctuation -def is_thaichar(ch): # āđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāđ„āļ—āļĒāļŦāļĢāļ·āļ­āđ„āļĄāđˆ + +def isthaichar(ch: str) -> bool: """ - Check if character is Thai + Check if a character is Thai + āđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāđ„āļ—āļĒāļŦāļĢāļ·āļ­āđ„āļĄāđˆ :param str ch: input character :return: True or False @@ -17,45 +21,44 @@ def is_thaichar(ch): # āđ€āļ›āđ‡āļ™āļ­āļąāļāļĐāļĢāđ„āļ—āļĒāļŦāļĢāļ·āļ­āđ„āļĄ return False -def is_thaiword(word): # āđ€āļ›āđ‡āļ™āļ„āļģāļ—āļĩāđˆāļĄāļĩāđāļ•āđˆāļ­āļąāļāļĐāļĢāđ„āļ—āļĒāļŦāļĢāļ·āļ­āđ„āļĄāđˆ +def isthai(word: str, ignore_chars: str = ".") -> bool: """ Check if all character is Thai + āđ€āļ›āđ‡āļ™āļ„āļģāļ—āļĩāđˆāļĄāļĩāđāļ•āđˆāļ­āļąāļāļĐāļĢāđ„āļ—āļĒāļŦāļĢāļ·āļ­āđ„āļĄāđˆ :param str word: input text + :param str ignore_chars: characters to be ignored (i.e. will be considered as Thai) :return: True or False """ + if not ignore_chars: + ignore_chars = "" + for ch in word: - if ch != "." and not is_thaichar(ch): + if ch not in ignore_chars and not isthaichar(ch): return False return True -def is_thai(text, check_all=False): +def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: """ - :param str text: input string or list of strings - :param bool check_all: checks all character or not - - :return: A dictionary with the first value as proportional of text that is Thai, and the second value being a tuple of all characters, along with true or false. 
+ :param str text: input text + :return: float, proportion of characters in the text that is Thai character """ - isthais = [] - num_isthai = 0 + if not text: + return 0 + + if not ignore_chars: + ignore_chars = "" + + num_thai = 0 + num_ignore = 0 for ch in text: - ch_val = ord(ch) - if ch_val >= 3584 and ch_val <= 3711: - num_isthai += 1 - if check_all: - isthais.append(True) - else: - if check_all: - isthais.append(False) - thai_percent = (num_isthai / len(text)) * 100 - - if check_all: - chars = list(text) - isthai_pairs = tuple(zip(chars, isthais)) - data = {"thai": thai_percent, "check_all": isthai_pairs} - else: - data = {"thai": thai_percent} - - return data + if ch in ignore_chars: + num_ignore += 1 + elif isthaichar(ch): + num_thai += 1 + + num_count = len(text) - num_ignore + + return (num_thai / num_count) * 100 diff --git a/pythainlp/util/thaiwordcheck.py b/pythainlp/util/thaiwordcheck.py index d2a036370..7237d2db6 100644 --- a/pythainlp/util/thaiwordcheck.py +++ b/pythainlp/util/thaiwordcheck.py @@ -1,51 +1,76 @@ # -*- coding: utf-8 -*- -''' -From https://github.com/wannaphongcom/open-thai-nlp-document/blob/master/check_thai_word.md -''' +""" +From +https://github.com/wannaphongcom/open-thai-nlp-document/blob/master/check_thai_word.md +""" import re -def _check1(word): # āđ€āļŠāđ‡āļ„āļ•āļąāļ§āļŠāļ°āļāļ”āļ§āđˆāļēāļ•āļĢāļ‡āļ•āļēāļĄāļĄāļēāļ•āļĢāļēāđ„āļŦāļĄ - if word in ['āļ','āļ”','āļš','āļ™','āļ‡','āļĄ','āļĒ','āļ§']: - return True - else: - return False -def _check2(word): # āđ€āļŠāđ‡āļ„āļ•āļąāļ§āļāļēāļĢāļąāļ™āļ•āđŒ āļ–āđ‰āļēāļĄāļĩ āđ„āļĄāđˆāđƒāļŠāđˆāļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ - if 'āđŒ' in word: - return False - else: - return True -def _check3(word): - if word in list("āļ†āļ“āļŒāļŽāļāļāļ‘āļ’āļ˜āļĻāļĐāļŽ"): # āļ–āđ‰āļēāļĄāļĩ āđāļŠāļ”āļ‡āļ§āđˆāļēāđ„āļĄāđˆāđƒāļŠāđˆāļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ - return False - else: - return True -def thaicheck(word): - """ - Check is Thai Word - - :param str word: word - :return: True or False - """ - pattern = re.compile(r"[āļ-āļŽāļŪ]",re.U) # āļŠāļģāļŦāļĢāļąāļšāļ•āļĢāļ§āļˆāļŠāļ­āļšāļžāļĒāļąāļāļŠāļ™āļ° - res = re.findall(pattern,word) # āļ”āļķāļ‡āļžāļĒāļąāļāļŠāļ™āļ°āļ—āļąāļąāđ‰āļ‡āļŦāļĄāļ”āļ­āļ­āļāļĄāļē - if res==[]: - return False - elif _check1(res[len(res)-1]) or len(res)==1: - if _check2(word): - word2=list(word) - i=0 - thai=True - if word in ['āļ†āđˆāļē','āđ€āļ†āļĩāđˆāļĒāļ™','āļĻāļķāļ','āļĻāļ­āļ','āđ€āļĻāļīāļ','āđ€āļĻāļĢāđ‰āļē','āļ˜','āļ“','āļŊāļžāļ“āļŊ','āđƒāļŦāļāđˆ','āļŦāļāđ‰āļē','āļ„āļ§āļēāļĒ','āļ„āļ§āļēāļĄ','āļāļĢāļīāđˆāļ‡āđ€āļāļĢāļ‡','āļœāļĨāļī']: # āļ‚āđ‰āļ­āļĒāļāđ€āļ§āđ‰āļ™ āļ„āļģāđ€āļŦāļĨāđˆāļēāļ™āļĩāđ‰āđ€āļ›āđ‡āļ™āļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ - return True - while i bool: # āđ€āļŠāđ‡āļ„āļ•āļąāļ§āļŠāļ°āļāļ”āļ§āđˆāļēāļ•āļĢāļ‡āļ•āļēāļĄāļĄāļēāļ•āļĢāļēāđ„āļŦāļĄ + if word in ["āļ", "āļ”", "āļš", "āļ™", "āļ‡", "āļĄ", "āļĒ", "āļ§"]: + return True + return False + + +def _check2(word: str) -> bool: # āđ€āļŠāđ‡āļ„āļ•āļąāļ§āļāļēāļĢāļąāļ™āļ•āđŒ āļ–āđ‰āļēāļĄāļĩ āđ„āļĄāđˆāđƒāļŠāđˆāļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ + if "āđŒ" in word: + return False + return True + + +def _check3(word: str) -> bool: + if word in list("āļ†āļ“āļŒāļŽāļāļāļ‘āļ’āļ˜āļĻāļĐāļŽ"): # āļ–āđ‰āļēāļĄāļĩ āđāļŠāļ”āļ‡āļ§āđˆāļēāđ„āļĄāđˆāđƒāļŠāđˆāļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ + return False + return True + + +def thaicheck(word: str) -> bool: + """ + Check if a word is an "authentic Thai word" + + :param str word: word + :return: True or False + """ + pattern = re.compile(r"[āļ-āļŽāļŪ]", re.U) # āļŠāļģāļŦāļĢāļąāļšāļ•āļĢāļ§āļˆāļŠāļ­āļšāļžāļĒāļąāļāļŠāļ™āļ° + res = re.findall(pattern, 
word) # āļ”āļķāļ‡āļžāļĒāļąāļāļŠāļ™āļ°āļ—āļąāļąāđ‰āļ‡āļŦāļĄāļ”āļ­āļ­āļāļĄāļē + + if res == []: + return False + + if _check1(res[len(res) - 1]) or len(res) == 1: + if _check2(word): + word2 = list(word) + i = 0 + thai = True + if word in [ + "āļ†āđˆāļē", + "āđ€āļ†āļĩāđˆāļĒāļ™", + "āļĻāļķāļ", + "āļĻāļ­āļ", + "āđ€āļĻāļīāļ", + "āđ€āļĻāļĢāđ‰āļē", + "āļ˜", + "āļ“", + "āļŊāļžāļ“āļŊ", + "āđƒāļŦāļāđˆ", + "āļŦāļāđ‰āļē", + "āļ„āļ§āļēāļĒ", + "āļ„āļ§āļēāļĄ", + "āļāļĢāļīāđˆāļ‡āđ€āļāļĢāļ‡", + "āļœāļĨāļī", + ]: # āļ‚āđ‰āļ­āļĒāļāđ€āļ§āđ‰āļ™ āļ„āļģāđ€āļŦāļĨāđˆāļēāļ™āļĩāđ‰āđ€āļ›āđ‡āļ™āļ„āļģāđ„āļ—āļĒāđāļ—āđ‰ + return True + + while i < len(word2) and thai: + thai = _check3(word2[i]) + if not thai: + return False + i += 1 + return True + + return False + + if word in ["āļāļ°", "āļāļĢāļ°", "āļ›āļ°", "āļ›āļĢāļ°"]: + return True + + return False diff --git a/pythainlp/util/wordtonum.py b/pythainlp/util/wordtonum.py index 7521ec156..43305d329 100644 --- a/pythainlp/util/wordtonum.py +++ b/pythainlp/util/wordtonum.py @@ -6,6 +6,7 @@ https://colab.research.google.com/drive/148WNIeclf0kOU6QxKd6pcfwpSs8l-VKD#scrollTo=EuVDd0nNuI8Q """ import re +from typing import Iterable, List from pythainlp.tokenize import Tokenizer @@ -39,7 +40,7 @@ _TOKENIZER = Tokenizer(custom_dict=_THAIWORD_NUMS_UNITS) -def _thaiword_to_num(tokens): +def _thaiword_to_num(tokens: List[str]) -> int: if not tokens: return None @@ -65,21 +66,21 @@ def _thaiword_to_num(tokens): return _THAI_INT_MAP[a] * _THAI_INT_MAP[b] + _thaiword_to_num(tokens[2:]) -def thaiword_to_num(thaiword): +def thaiword_to_num(word: str) -> int: """ - Converts a thai word to number + Converts a Thai number spellout word to actual number value - :param str thaiword: input thai word + :param str word: a Thai number spellout :return: number """ - if not thaiword: + if not word: return None tokens = [] - if isinstance(thaiword,str): - tokens = _TOKENIZER.word_tokenize(thaiword) - elif isinstance(thaiword,list) or isinstance(thaiword,tuple) or isinstance(thaiword,set) or isinstance(thaiword,frozenset): - for w in thaiword: + if isinstance(word, str): + tokens = _TOKENIZER.word_tokenize(word) + elif isinstance(word, Iterable): + for w in word: tokens.extend(_TOKENIZER.word_tokenize(w)) res = [] diff --git a/pythainlp/word_vector/__init__.py b/pythainlp/word_vector/__init__.py index d035e5395..d1da4a2e3 100644 --- a/pythainlp/word_vector/__init__.py +++ b/pythainlp/word_vector/__init__.py @@ -4,6 +4,8 @@ thai2fit - Thai word vector Code by https://github.com/cstorm125/thai2fit """ +from typing import List + import numpy as np from gensim.models import KeyedVectors from pythainlp.corpus import download as download_data @@ -13,7 +15,7 @@ WV_DIM = 300 -def _download(): +def _download() -> str: path = get_corpus_path("thai2fit_wv") if not path: download_data("thai2fit_wv") @@ -33,7 +35,7 @@ def get_model(): _MODEL = get_model() -def most_similar_cosmul(positive: list, negative: list): +def most_similar_cosmul(positive: List[str], negative: List[str]): """ Word arithmetic operations If a word is not in the vocabulary, KeyError will be raised. @@ -47,18 +49,18 @@ def most_similar_cosmul(positive: list, negative: list): return _MODEL.most_similar_cosmul(positive=positive, negative=negative) -def doesnt_match(listdata): +def doesnt_match(words: List[str]) -> str: """ Pick one word that doesn't match other words in the list If a word is not in the vocabulary, KeyError will be raised. 
- :param list listdata: a list of words + :param list words: a list of words :return: word that doesn't match """ - return _MODEL.doesnt_match(listdata) + return _MODEL.doesnt_match(words) -def similarity(word1, word2): +def similarity(word1: str, word2: str) -> float: """ Get cosine similarity between two words. If a word is not in the vocabulary, KeyError will be raised. @@ -70,7 +72,7 @@ def similarity(word1, word2): return _MODEL.similarity(word1, word2) -def sentence_vectorizer(text, use_mean=True): +def sentence_vectorizer(text: str, use_mean: bool = True): """ Get sentence vector from text If a word is not in the vocabulary, KeyError will be raised. diff --git a/requirements.txt b/requirements.txt index 3159b92b1..7fd66ad78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ -marisa-trie -nltk>=3.2.2 dill -marisa_trie +marisa-trie nltk>=3.2.2 pytz requests diff --git a/setup.cfg b/setup.cfg index 350779304..809721c80 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.0 +current_version = 2.0.2 commit = True tag = True diff --git a/setup.py b/setup.py index a47948438..879e9b93f 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ "ipa": ["epitran"], "ml": ["fastai>=1.0.38", "keras", "numpy", "torch"], "ner": ["sklearn-crfsuite"], - "thai2fit": ["gensim", "numpy","emoji"], + "thai2fit": ["emoji", "gensim", "numpy"], "thai2rom": ["keras", "numpy"], "full": [ "artagger", @@ -34,7 +34,7 @@ setup( name="pythainlp", - version="2.0", + version="2.0.2", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", @@ -54,16 +54,12 @@ "stopwords_th.txt", "syllables_th.txt", "tha-wn.db", - "new-thaidict.txt", - "negation.txt", - "provinces.csv", - "pt_tagger_1.dill", - "ud_thai-pud_pt_tagger.dill", - "ud_thai-pud_unigram_tagger.dill", - "unigram_tagger.dill", - "words_th.txt", + "thailand_provinces_th.txt", + "tnc_freq.txt", + "ud_thai_pud_pt_tagger.dill", + "ud_thai_pud_unigram_tagger.dill", "words_th_frozen_201810.txt", - "tnc_freq.txt" + "words_th.txt", ], }, include_package_data=True, @@ -77,6 +73,8 @@ "natural language processing", "text analytics", "ThaiNLP", + "text processing", + "localization", ], classifiers=[ "Development Status :: 5 - Production/Stable", diff --git a/tests/__init__.py b/tests/__init__.py index 6ba23adda..e569951cd 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -20,11 +20,10 @@ tnc, ttc, wordnet, - download + download, ) from pythainlp.soundex import lk82, metasound, soundex, udom83 -from pythainlp.spell import correct, spell -from pythainlp.spell.pn import NorvigSpellChecker, dictionary, known, prob +from pythainlp.spell import correct, spell, NorvigSpellChecker from pythainlp.summarize import summarize from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram from pythainlp.tag.locations import tag_provinces @@ -37,7 +36,7 @@ multi_cut, newmm, dict_trie, - Tokenizer + Tokenizer, ) from pythainlp.tokenize import pyicu as tokenize_pyicu from pythainlp.tokenize import ( @@ -58,9 +57,9 @@ digit_to_text, eng_to_thai, find_keyword, - is_thai, - is_thaichar, - is_thaiword, + countthai, + isthai, + isthaichar, normalize, now_reign_year, num_to_thaiword, @@ -72,9 +71,9 @@ thai_strftime, thai_to_eng, thaiword_to_num, - thaicheck + thaicheck, ) -#from pythainlp.ulmfit import rm_brackets + class TestUM(unittest.TestCase): """ @@ -177,10 +176,6 @@ def test_spell(self): self.assertEqual(correct(""), "") 
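The word_vector changes above only add type hints and rename parameters, so usage is unchanged. A hedged sketch follows: the thai2fit_wv vectors are downloaded on first use, and every word below is assumed to be in the model's vocabulary (out-of-vocabulary words raise KeyError).

```python
from pythainlp import word_vector

# Pick the word that does not belong with the rest.
print(word_vector.doesnt_match(["ญี่ปุ่น", "พม่า", "ไอติม"]))  # "ไอติม", per the tests

# Cosine similarity between two words; the exact value depends on the model.
print(word_vector.similarity("คน", "มนุษย์"))

# Mean (use_mean=True) of the word vectors in a tokenized sentence.
print(word_vector.sentence_vectorizer("ฉันกินข้าว", use_mean=True))
```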
self.assertIsNotNone(correct("āļ—āļ”āļŠāļ­āļ‡")) - self.assertIsNotNone(dictionary()) - self.assertGreaterEqual(prob("āļĄāļĩ"), 0) - self.assertIsNotNone(known(["āđ€āļāļīāļ”", "abc", ""])) - checker = NorvigSpellChecker(dict_filter="") self.assertIsNotNone(checker.dictionary()) self.assertGreaterEqual(checker.prob("āļĄāļĩ"), 0) @@ -262,7 +257,13 @@ def test_ner(self): self.assertEqual(ner.get_ner(""), []) self.assertIsNotNone(ner.get_ner("āđāļĄāļ§āļ—āļģāļ­āļ°āđ„āļĢāļ•āļ­āļ™āļŦāđ‰āļēāđ‚āļĄāļ‡āđ€āļŠāđ‰āļē")) self.assertIsNotNone(ner.get_ner("āđāļĄāļ§āļ—āļģāļ­āļ°āđ„āļĢāļ•āļ­āļ™āļŦāđ‰āļēāđ‚āļĄāļ‡āđ€āļŠāđ‰āļē", pos=False)) - self.assertIsNotNone(ner.get_ner("āļ„āļ“āļ°āļ§āļīāļ—āļĒāļēāļĻāļēāļŠāļ•āļĢāđŒāļ›āļĢāļ°āļĒāļļāļāļ•āđŒāđāļĨāļ°āļ§āļīāļĻāļ§āļāļĢāļĢāļĄāļĻāļēāļŠāļ•āļĢāđŒ āļ—āļĩāđˆāļ­āļĒāļđāđˆ āļĄāļŦāļēāļ§āļīāļ—āļĒāļēāļĨāļąāļĒāļ‚āļ­āļ™āđāļāđˆāļ™ āļ§āļīāļ—āļĒāļēāđ€āļ‚āļ•āļŦāļ™āļ­āļ‡āļ„āļēāļĒ 112 āļŦāļĄāļđāđˆ 7 āļšāđ‰āļēāļ™āļŦāļ™āļ­āļ‡āđ€āļ”āļīāđˆāļ™ āļ•āļģāļšāļĨāļŦāļ™āļ­āļ‡āļāļ­āļĄāđ€āļāļēāļ° āļ­āļģāđ€āļ āļ­āđ€āļĄāļ·āļ­āļ‡ āļˆāļąāļ‡āļŦāļ§āļąāļ”āļŦāļ™āļ­āļ‡āļ„āļēāļĒ 43000")) + self.assertIsNotNone( + ner.get_ner( + """āļ„āļ“āļ°āļ§āļīāļ—āļĒāļēāļĻāļēāļŠāļ•āļĢāđŒāļ›āļĢāļ°āļĒāļļāļāļ•āđŒāđāļĨāļ°āļ§āļīāļĻāļ§āļāļĢāļĢāļĄāļĻāļēāļŠāļ•āļĢāđŒ āļĄāļŦāļēāļ§āļīāļ—āļĒāļēāļĨāļąāļĒāļ‚āļ­āļ™āđāļāđˆāļ™ + āļ§āļīāļ—āļĒāļēāđ€āļ‚āļ•āļŦāļ™āļ­āļ‡āļ„āļēāļĒ 112 āļŦāļĄāļđāđˆ 7 āļšāđ‰āļēāļ™āļŦāļ™āļ­āļ‡āđ€āļ”āļīāđˆāļ™ āļ•āļģāļšāļĨāļŦāļ™āļ­āļ‡āļāļ­āļĄāđ€āļāļēāļ° āļ­āļģāđ€āļ āļ­āđ€āļĄāļ·āļ­āļ‡ + āļˆāļąāļ‡āļŦāļ§āļąāļ”āļŦāļ™āļ­āļ‡āļ„āļēāļĒ 43000""" + ) + ) # self.assertEqual( # ner.get_ner("āđāļĄāļ§āļ—āļģāļ­āļ°āđ„āļĢāļ•āļ­āļ™āļŦāđ‰āļēāđ‚āļĄāļ‡āđ€āļŠāđ‰āļē"), # [ @@ -339,8 +340,9 @@ def test_word_tokenize(self): self.assertIsNotNone(word_tokenize("āļ—āļ”āļŠāļ­āļš", engine="XX")) self.assertIsNotNone(word_tokenize("āļ—āļ”āļŠāļ­āļš", engine="deepcut")) self.assertIsNotNone(word_tokenize("", engine="deepcut")) + def test_Tokenizer(self): - t_test=Tokenizer() + t_test = Tokenizer() self.assertEqual(t_test.word_tokenize(""), []) def test_word_tokenize_icu(self): @@ -399,7 +401,8 @@ def test_sent_tokenize(self): self.assertEqual(sent_tokenize(None), []) self.assertEqual(sent_tokenize(""), []) self.assertEqual( - sent_tokenize("āļĢāļąāļāļ™āđ‰āļģ āļĢāļąāļāļ›āļĨāļē ", engine="whitespace"), ["āļĢāļąāļāļ™āđ‰āļģ", "āļĢāļąāļāļ›āļĨāļē", ""] + sent_tokenize("āļĢāļąāļāļ™āđ‰āļģ āļĢāļąāļāļ›āļĨāļē ", engine="whitespace"), + ["āļĢāļąāļāļ™āđ‰āļģ", "āļĢāļąāļāļ›āļĨāļē", ""], ) self.assertEqual(sent_tokenize("āļĢāļąāļāļ™āđ‰āļģ āļĢāļąāļāļ›āļĨāļē "), ["āļĢāļąāļāļ™āđ‰āļģ", "āļĢāļąāļāļ›āļĨāļē"]) @@ -416,9 +419,9 @@ def test_syllable_tokenize(self): ) def test_tcc(self): - self.assertEqual(tcc.tcc(None), "") - self.assertEqual(tcc.tcc(""), "") - self.assertEqual(tcc.tcc("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"), "āļ›/āļĢāļ°/āđ€āļ—/āļĻ/āđ„āļ—/āļĒ") + self.assertEqual(tcc.tcc(None), []) + self.assertEqual(tcc.tcc(""), []) + self.assertEqual(tcc.tcc("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"), ["āļ›", "āļĢāļ°", "āđ€āļ—", "āļĻ", "āđ„āļ—", "āļĒ"]) self.assertEqual(list(tcc.tcc_gen("")), []) self.assertEqual(tcc.tcc_pos(""), set()) @@ -558,20 +561,24 @@ def test_normalize(self): # ### pythainlp.util.thai - def test_is_thai(self): - self.assertEqual(is_thai("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"), {"thai": 100.0}) - self.assertIsNotNone(is_thai("āđ€āļœāļ·āļ­āļ", check_all=True)) - self.assertIsNotNone(is_thai("āđ€āļœāļ·āļ­āļabc", check_all=True)) - - def test_is_thaichar(self): - self.assertEqual(is_thaichar("āļ"), True) - self.assertEqual(is_thaichar("a"), False) - 
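The spell-checker tests here reflect an API move: the module-level dictionary(), prob() and known() imports from pythainlp.spell.pn are dropped, and dictionary() and prob() are exercised as methods of a NorvigSpellChecker instance imported directly from pythainlp.spell. A small sketch, with outputs hedged:

```python
from pythainlp.spell import NorvigSpellChecker, correct, spell

print(spell("ทดสอง"))    # ranked list of candidate corrections
print(correct("ทดสอง"))  # best single correction (e.g. "ทดสอบ", assumed)

# Per-instance access to the underlying dictionary, as in the updated tests.
checker = NorvigSpellChecker(dict_filter="")
print(checker.prob("มี"))    # unigram probability, >= 0
print(checker.dictionary())  # word-frequency entries backing the checker
```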
self.assertEqual(is_thaichar("0"), False) - - def test_is_thaiword(self): - self.assertEqual(is_thaiword("āđ„āļ—āļĒ"), True) - self.assertEqual(is_thaiword("āļ•.āļ„."), True) - self.assertEqual(is_thaiword("āđ„āļ—āļĒ0"), False) + def test_countthai(self): + self.assertEqual(countthai(""), 0) + self.assertEqual(countthai("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"), 100.0) + self.assertEqual(countthai("(āļāļāļ•.)", ".()"), 100.0) + self.assertEqual(countthai("(āļāļāļ•.)", None), 50.0) + + def test_isthaichar(self): + self.assertEqual(isthaichar("āļ"), True) + self.assertEqual(isthaichar("a"), False) + self.assertEqual(isthaichar("0"), False) + + def test_isthai(self): + self.assertEqual(isthai("āđ„āļ—āļĒ"), True) + self.assertEqual(isthai("āđ„āļ—āļĒ0"), False) + self.assertEqual(isthai("āļ•.āļ„."), True) + self.assertEqual(isthai("(āļ•.āļ„.)"), False) + self.assertEqual(isthai("āļ•.āļ„.", ignore_chars=None), False) + self.assertEqual(isthai("(āļ•.āļ„.)", ignore_chars=".()"), True) def test_is_thaicheck(self): self.assertEqual(thaicheck("āļ•āļē"), True) @@ -608,5 +615,6 @@ def test_thai2vec(self): word_vector.doesnt_match(["āļāļĩāđˆāļ›āļļāđˆāļ™", "āļžāļĄāđˆāļē", "āđ„āļ­āļ•āļīāļĄ"]), "āđ„āļ­āļ•āļīāļĄ" ) + if __name__ == "__main__": unittest.main()
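In summary, is_thai(), is_thaichar() and is_thaiword() are replaced by countthai(), isthaichar() and isthai(). A short sketch of the new behaviour, with expected values taken from the updated tests above and the import path assumed to be pythainlp.util as in tests/__init__.py:

```python
from pythainlp.util import countthai, isthai, isthaichar

# countthai() now returns a plain percentage instead of a dict, and skips any
# characters listed in ignore_chars before computing the ratio.
print(countthai("ประเทศไทย"))      # 100.0
print(countthai("(กกต.)", ".()"))  # 100.0, parentheses and dots ignored
print(countthai("(กกต.)", None))   # 50.0, nothing ignored

# isthaichar() tests a single character; isthai() tests a whole string and
# also honours ignore_chars (the default ignore set covers "." but not "()").
print(isthaichar("ก"))                       # True
print(isthai("ไทย0"))                        # False
print(isthai("ต.ค."))                        # True
print(isthai("(ต.ค.)", ignore_chars=".()"))  # True
```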